transformplan 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {transformplan-0.1.3/transformplan.egg-info → transformplan-0.2.0}/PKG-INFO +2 -2
- {transformplan-0.1.3 → transformplan-0.2.0}/README.md +1 -1
- {transformplan-0.1.3 → transformplan-0.2.0}/pyproject.toml +1 -1
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_duckdb.py +125 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_math.py +164 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/backends/base.py +11 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/backends/duckdb.py +20 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/backends/polars.py +19 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/chunking.py +1 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/math.py +40 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/validation.py +46 -0
- {transformplan-0.1.3 → transformplan-0.2.0/transformplan.egg-info}/PKG-INFO +2 -2
- {transformplan-0.1.3 → transformplan-0.2.0}/LICENSE +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/setup.cfg +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_chunking.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_column.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_core.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_datetime.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_filters.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_integration.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_join.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_map.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_map_encoding.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_math_scaling.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_protocol.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_rows.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_string.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_validation.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/__init__.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/backends/__init__.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/core.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/filters.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/__init__.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/column.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/datetime.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/join.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/map.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/rows.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/string.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/plan.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/protocol.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/py.typed +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/sql_utils.py +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan.egg-info/SOURCES.txt +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan.egg-info/dependency_links.txt +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan.egg-info/requires.txt +0 -0
- {transformplan-0.1.3 → transformplan-0.2.0}/transformplan.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: transformplan
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Safe, reproducible data transformations with built-in auditing and validation
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Project-URL: Homepage, https://github.com/limebit/transformplan
|
|
@@ -52,7 +52,7 @@ Dynamic: license-file
|
|
|
52
52
|
```python
|
|
53
53
|
from transformplan import TransformPlan, Col
|
|
54
54
|
|
|
55
|
-
# Build readable pipelines with
|
|
55
|
+
# Build readable pipelines with 89 chainable operations
|
|
56
56
|
plan = (
|
|
57
57
|
TransformPlan()
|
|
58
58
|
# Standardize column names
|
|
@@ -1617,6 +1617,131 @@ class TestMathDiffFromAgg:
|
|
|
1617
1617
|
)
|
|
1618
1618
|
|
|
1619
1619
|
|
|
1620
|
+
class TestMathDiffLag:
|
|
1621
|
+
"""Tests for math_diff_lag with DuckDB backend."""
|
|
1622
|
+
|
|
1623
|
+
def test_numeric_basic(
|
|
1624
|
+
self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
|
|
1625
|
+
) -> None:
|
|
1626
|
+
rel = con.sql(
|
|
1627
|
+
"SELECT * FROM (VALUES (1, 10), (2, 30), (3, 35), (4, 50)) AS t(id, val)"
|
|
1628
|
+
)
|
|
1629
|
+
result, _ = (
|
|
1630
|
+
TransformPlan()
|
|
1631
|
+
.math_diff_lag("val", order_by="id", new_column="diff")
|
|
1632
|
+
.process(rel, backend=backend)
|
|
1633
|
+
)
|
|
1634
|
+
vals = _col_values(result, "diff")
|
|
1635
|
+
assert vals[0] is None
|
|
1636
|
+
assert vals[1:] == [20, 5, 15]
|
|
1637
|
+
|
|
1638
|
+
def test_numeric_lag2(
|
|
1639
|
+
self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
|
|
1640
|
+
) -> None:
|
|
1641
|
+
rel = con.sql(
|
|
1642
|
+
"SELECT * FROM (VALUES (1, 10), (2, 30), (3, 35), (4, 50)) AS t(id, val)"
|
|
1643
|
+
)
|
|
1644
|
+
result, _ = (
|
|
1645
|
+
TransformPlan()
|
|
1646
|
+
.math_diff_lag("val", order_by="id", new_column="diff", lag=2)
|
|
1647
|
+
.process(rel, backend=backend)
|
|
1648
|
+
)
|
|
1649
|
+
vals = _col_values(result, "diff")
|
|
1650
|
+
assert vals[0] is None
|
|
1651
|
+
assert vals[1] is None
|
|
1652
|
+
assert vals[2:] == [25, 20]
|
|
1653
|
+
|
|
1654
|
+
def test_grouped_numeric(
|
|
1655
|
+
self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
|
|
1656
|
+
) -> None:
|
|
1657
|
+
rel = con.sql(
|
|
1658
|
+
"SELECT * FROM (VALUES "
|
|
1659
|
+
"('A', 1, 10), ('A', 2, 30), ('A', 3, 35), "
|
|
1660
|
+
"('B', 1, 100), ('B', 2, 150), ('B', 3, 160)"
|
|
1661
|
+
") AS t(grp, seq, val)"
|
|
1662
|
+
)
|
|
1663
|
+
result, _ = (
|
|
1664
|
+
TransformPlan()
|
|
1665
|
+
.math_diff_lag("val", order_by="seq", new_column="diff", group_by="grp")
|
|
1666
|
+
.rows_sort(["grp", "seq"])
|
|
1667
|
+
.process(rel, backend=backend)
|
|
1668
|
+
)
|
|
1669
|
+
vals = _col_values(result, "diff")
|
|
1670
|
+
assert vals == [None, 20, 5, None, 50, 10]
|
|
1671
|
+
|
|
1672
|
+
def test_datetime_column(
|
|
1673
|
+
self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
|
|
1674
|
+
) -> None:
|
|
1675
|
+
rel = con.sql(
|
|
1676
|
+
"SELECT * FROM (VALUES "
|
|
1677
|
+
"(1, TIMESTAMP '2024-01-01 00:00:00'), "
|
|
1678
|
+
"(2, TIMESTAMP '2024-01-01 01:00:00'), "
|
|
1679
|
+
"(3, TIMESTAMP '2024-01-01 03:00:00')"
|
|
1680
|
+
") AS t(id, ts)"
|
|
1681
|
+
)
|
|
1682
|
+
result, _ = (
|
|
1683
|
+
TransformPlan()
|
|
1684
|
+
.math_diff_lag("ts", order_by="id", new_column="gap")
|
|
1685
|
+
.process(rel, backend=backend)
|
|
1686
|
+
)
|
|
1687
|
+
vals = _col_values(result, "gap")
|
|
1688
|
+
assert vals[0] is None
|
|
1689
|
+
assert vals[1].total_seconds() == 3600
|
|
1690
|
+
assert vals[2].total_seconds() == 7200
|
|
1691
|
+
|
|
1692
|
+
def test_datetime_grouped(
|
|
1693
|
+
self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
|
|
1694
|
+
) -> None:
|
|
1695
|
+
rel = con.sql(
|
|
1696
|
+
"SELECT * FROM (VALUES "
|
|
1697
|
+
"('A', TIMESTAMP '2024-01-01 00:00:00'), "
|
|
1698
|
+
"('A', TIMESTAMP '2024-01-01 02:00:00'), "
|
|
1699
|
+
"('B', TIMESTAMP '2024-01-01 10:00:00'), "
|
|
1700
|
+
"('B', TIMESTAMP '2024-01-01 13:00:00')"
|
|
1701
|
+
") AS t(patient, ts)"
|
|
1702
|
+
)
|
|
1703
|
+
result, _ = (
|
|
1704
|
+
TransformPlan()
|
|
1705
|
+
.math_diff_lag("ts", order_by="ts", new_column="gap", group_by="patient")
|
|
1706
|
+
.rows_sort(["patient", "ts"])
|
|
1707
|
+
.process(rel, backend=backend)
|
|
1708
|
+
)
|
|
1709
|
+
vals = _col_values(result, "gap")
|
|
1710
|
+
assert vals[0] is None
|
|
1711
|
+
assert vals[1].total_seconds() / 3600 == 2.0
|
|
1712
|
+
assert vals[2] is None
|
|
1713
|
+
assert vals[3].total_seconds() / 3600 == 3.0
|
|
1714
|
+
|
|
1715
|
+
def test_order_by_list(
|
|
1716
|
+
self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
|
|
1717
|
+
) -> None:
|
|
1718
|
+
rel = con.sql(
|
|
1719
|
+
"SELECT * FROM (VALUES "
|
|
1720
|
+
"(1, 1, 10), (1, 2, 20), (2, 1, 30), (2, 2, 40)"
|
|
1721
|
+
") AS t(a, b, val)"
|
|
1722
|
+
)
|
|
1723
|
+
result, _ = (
|
|
1724
|
+
TransformPlan()
|
|
1725
|
+
.math_diff_lag("val", order_by=["a", "b"], new_column="diff")
|
|
1726
|
+
.process(rel, backend=backend)
|
|
1727
|
+
)
|
|
1728
|
+
vals = _col_values(result, "diff")
|
|
1729
|
+
assert vals == [None, 10, 10, 10]
|
|
1730
|
+
|
|
1731
|
+
def test_global_no_group(
|
|
1732
|
+
self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
|
|
1733
|
+
) -> None:
|
|
1734
|
+
rel = con.sql("SELECT * FROM (VALUES (3, 30), (1, 10), (2, 20)) AS t(seq, val)")
|
|
1735
|
+
result, _ = (
|
|
1736
|
+
TransformPlan()
|
|
1737
|
+
.math_diff_lag("val", order_by="seq", new_column="diff")
|
|
1738
|
+
.process(rel, backend=backend)
|
|
1739
|
+
)
|
|
1740
|
+
vals = _col_values(result, "diff")
|
|
1741
|
+
assert vals[0] is None
|
|
1742
|
+
assert vals[1:] == [10, 10]
|
|
1743
|
+
|
|
1744
|
+
|
|
1620
1745
|
class TestColExpr:
|
|
1621
1746
|
"""Tests for col_expr on DuckDB backend."""
|
|
1622
1747
|
|
|
@@ -465,3 +465,167 @@ class TestMathDiffFromAgg:
|
|
|
465
465
|
result1, _ = plan.process(numeric_df)
|
|
466
466
|
result2, _ = restored.process(numeric_df)
|
|
467
467
|
assert result1["diff"].to_list() == result2["diff"].to_list()
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
class TestMathDiffLag:
|
|
471
|
+
"""Tests for math_diff_lag operation."""
|
|
472
|
+
|
|
473
|
+
def test_numeric_basic(self) -> None:
|
|
474
|
+
"""Test lag=1 on integers ordered by id; first row null."""
|
|
475
|
+
df = pl.DataFrame({"id": [1, 2, 3, 4], "val": [10, 30, 35, 50]})
|
|
476
|
+
plan = TransformPlan().math_diff_lag("val", order_by="id", new_column="diff")
|
|
477
|
+
result, _ = plan.process(df)
|
|
478
|
+
assert result["diff"].to_list() == [None, 20.0, 5.0, 15.0]
|
|
479
|
+
|
|
480
|
+
def test_numeric_lag2(self) -> None:
|
|
481
|
+
"""Test lag=2; first two rows null."""
|
|
482
|
+
df = pl.DataFrame({"id": [1, 2, 3, 4], "val": [10, 30, 35, 50]})
|
|
483
|
+
plan = TransformPlan().math_diff_lag(
|
|
484
|
+
"val", order_by="id", new_column="diff", lag=2
|
|
485
|
+
)
|
|
486
|
+
result, _ = plan.process(df)
|
|
487
|
+
assert result["diff"].to_list() == [None, None, 25.0, 20.0]
|
|
488
|
+
|
|
489
|
+
def test_grouped_numeric(self) -> None:
|
|
490
|
+
"""Test partition by group; nulls restart per group."""
|
|
491
|
+
df = pl.DataFrame(
|
|
492
|
+
{
|
|
493
|
+
"grp": ["A", "A", "A", "B", "B", "B"],
|
|
494
|
+
"seq": [1, 2, 3, 1, 2, 3],
|
|
495
|
+
"val": [10, 30, 35, 100, 150, 160],
|
|
496
|
+
}
|
|
497
|
+
)
|
|
498
|
+
plan = TransformPlan().math_diff_lag(
|
|
499
|
+
"val", order_by="seq", new_column="diff", group_by="grp"
|
|
500
|
+
)
|
|
501
|
+
result, _ = plan.process(df)
|
|
502
|
+
expected = [None, 20.0, 5.0, None, 50.0, 10.0]
|
|
503
|
+
assert result["diff"].to_list() == expected
|
|
504
|
+
|
|
505
|
+
def test_datetime_column(self) -> None:
|
|
506
|
+
"""Test datetime input produces duration output."""
|
|
507
|
+
df = pl.DataFrame(
|
|
508
|
+
{
|
|
509
|
+
"id": [1, 2, 3],
|
|
510
|
+
"ts": [
|
|
511
|
+
datetime(2024, 1, 1, 0, 0),
|
|
512
|
+
datetime(2024, 1, 1, 1, 0),
|
|
513
|
+
datetime(2024, 1, 1, 3, 0),
|
|
514
|
+
],
|
|
515
|
+
}
|
|
516
|
+
)
|
|
517
|
+
plan = TransformPlan().math_diff_lag("ts", order_by="id", new_column="gap")
|
|
518
|
+
result, _ = plan.process(df)
|
|
519
|
+
assert result["gap"].dtype == pl.Duration
|
|
520
|
+
vals = result["gap"].to_list()
|
|
521
|
+
assert vals[0] is None
|
|
522
|
+
assert vals[1].total_seconds() == 3600
|
|
523
|
+
assert vals[2].total_seconds() == 7200
|
|
524
|
+
|
|
525
|
+
def test_datetime_grouped(self) -> None:
|
|
526
|
+
"""Test primary use case: time between events per patient."""
|
|
527
|
+
df = pl.DataFrame(
|
|
528
|
+
{
|
|
529
|
+
"patient": ["A", "A", "B", "B"],
|
|
530
|
+
"ts": [
|
|
531
|
+
datetime(2024, 1, 1, 0, 0),
|
|
532
|
+
datetime(2024, 1, 1, 2, 0),
|
|
533
|
+
datetime(2024, 1, 1, 10, 0),
|
|
534
|
+
datetime(2024, 1, 1, 13, 0),
|
|
535
|
+
],
|
|
536
|
+
}
|
|
537
|
+
)
|
|
538
|
+
plan = TransformPlan().math_diff_lag(
|
|
539
|
+
"ts", order_by="ts", new_column="gap", group_by="patient"
|
|
540
|
+
)
|
|
541
|
+
result, _ = plan.process(df)
|
|
542
|
+
assert result["gap"].dtype == pl.Duration
|
|
543
|
+
vals = result["gap"].to_list()
|
|
544
|
+
assert vals[0] is None
|
|
545
|
+
assert vals[1].total_seconds() / 3600 == 2.0
|
|
546
|
+
assert vals[2] is None
|
|
547
|
+
assert vals[3].total_seconds() / 3600 == 3.0
|
|
548
|
+
|
|
549
|
+
def test_order_by_different_column(self) -> None:
|
|
550
|
+
"""Test diffing 'value' ordered by 'timestamp'."""
|
|
551
|
+
df = pl.DataFrame(
|
|
552
|
+
{
|
|
553
|
+
"ts": [
|
|
554
|
+
datetime(2024, 1, 1),
|
|
555
|
+
datetime(2024, 1, 2),
|
|
556
|
+
datetime(2024, 1, 3),
|
|
557
|
+
],
|
|
558
|
+
"val": [100, 130, 125],
|
|
559
|
+
}
|
|
560
|
+
)
|
|
561
|
+
plan = TransformPlan().math_diff_lag("val", order_by="ts", new_column="change")
|
|
562
|
+
result, _ = plan.process(df)
|
|
563
|
+
assert result["change"].to_list() == [None, 30.0, -5.0]
|
|
564
|
+
|
|
565
|
+
def test_order_by_list(self) -> None:
|
|
566
|
+
"""Test multi-column order_by."""
|
|
567
|
+
df = pl.DataFrame(
|
|
568
|
+
{
|
|
569
|
+
"a": [1, 1, 2, 2],
|
|
570
|
+
"b": [1, 2, 1, 2],
|
|
571
|
+
"val": [10, 20, 30, 40],
|
|
572
|
+
}
|
|
573
|
+
)
|
|
574
|
+
plan = TransformPlan().math_diff_lag(
|
|
575
|
+
"val", order_by=["a", "b"], new_column="diff"
|
|
576
|
+
)
|
|
577
|
+
result, _ = plan.process(df)
|
|
578
|
+
assert result["diff"].to_list() == [None, 10.0, 10.0, 10.0]
|
|
579
|
+
|
|
580
|
+
def test_global_no_group(self) -> None:
|
|
581
|
+
"""Test no group_by, global ordering."""
|
|
582
|
+
df = pl.DataFrame({"seq": [3, 1, 2], "val": [30, 10, 20]})
|
|
583
|
+
plan = TransformPlan().math_diff_lag("val", order_by="seq", new_column="diff")
|
|
584
|
+
result, _ = plan.process(df)
|
|
585
|
+
# After sorting by seq: [10, 20, 30], diffs: [None, 10, 10]
|
|
586
|
+
assert result["diff"].to_list() == [None, 10.0, 10.0]
|
|
587
|
+
|
|
588
|
+
def test_validation_nonexistent_column(self, numeric_df: pl.DataFrame) -> None:
|
|
589
|
+
"""Test validation catches non-existent column."""
|
|
590
|
+
plan = TransformPlan().math_diff_lag(
|
|
591
|
+
"nonexistent", order_by="a", new_column="diff"
|
|
592
|
+
)
|
|
593
|
+
result = plan.validate(numeric_df)
|
|
594
|
+
assert not result.is_valid
|
|
595
|
+
assert "does not exist" in str(result.errors[0])
|
|
596
|
+
|
|
597
|
+
def test_validation_wrong_type(self, basic_df: pl.DataFrame) -> None:
|
|
598
|
+
"""Test validation catches string column."""
|
|
599
|
+
plan = TransformPlan().math_diff_lag("name", order_by="id", new_column="diff")
|
|
600
|
+
result = plan.validate(basic_df)
|
|
601
|
+
assert not result.is_valid
|
|
602
|
+
assert "numeric or datetime" in str(result.errors[0])
|
|
603
|
+
|
|
604
|
+
def test_validation_missing_order_by(self, numeric_df: pl.DataFrame) -> None:
|
|
605
|
+
"""Test validation catches missing order_by column."""
|
|
606
|
+
plan = TransformPlan().math_diff_lag(
|
|
607
|
+
"a", order_by="nonexistent", new_column="diff"
|
|
608
|
+
)
|
|
609
|
+
result = plan.validate(numeric_df)
|
|
610
|
+
assert not result.is_valid
|
|
611
|
+
assert "Order-by" in str(result.errors[0])
|
|
612
|
+
|
|
613
|
+
def test_validation_missing_group_by(self, numeric_df: pl.DataFrame) -> None:
|
|
614
|
+
"""Test validation catches missing group_by column."""
|
|
615
|
+
plan = TransformPlan().math_diff_lag(
|
|
616
|
+
"a", order_by="a", new_column="diff", group_by="nonexistent"
|
|
617
|
+
)
|
|
618
|
+
result = plan.validate(numeric_df)
|
|
619
|
+
assert not result.is_valid
|
|
620
|
+
assert "Group-by" in str(result.errors[0])
|
|
621
|
+
|
|
622
|
+
def test_serialization_roundtrip(self, numeric_df: pl.DataFrame) -> None:
|
|
623
|
+
"""Test JSON serialization round-trip."""
|
|
624
|
+
plan = TransformPlan().math_diff_lag(
|
|
625
|
+
"a", order_by="a", new_column="diff", group_by="b", lag=2
|
|
626
|
+
)
|
|
627
|
+
json_str = plan.to_json()
|
|
628
|
+
restored = TransformPlan.from_json(json_str)
|
|
629
|
+
result1, _ = plan.process(numeric_df)
|
|
630
|
+
result2, _ = restored.process(numeric_df)
|
|
631
|
+
assert result1["diff"].to_list() == result2["diff"].to_list()
|
|
@@ -309,6 +309,17 @@ class Backend(ABC):
|
|
|
309
309
|
group_by: list[str] | None,
|
|
310
310
|
) -> Any: ...
|
|
311
311
|
|
|
312
|
+
@abstractmethod
|
|
313
|
+
def math_diff_lag(
|
|
314
|
+
self,
|
|
315
|
+
data: Any,
|
|
316
|
+
column: str,
|
|
317
|
+
order_by: list[str],
|
|
318
|
+
new_column: str,
|
|
319
|
+
group_by: list[str] | None,
|
|
320
|
+
lag: int,
|
|
321
|
+
) -> Any: ...
|
|
322
|
+
|
|
312
323
|
@abstractmethod
|
|
313
324
|
def math_standardize(
|
|
314
325
|
self,
|
|
@@ -582,6 +582,26 @@ class DuckDBBackend(Backend):
|
|
|
582
582
|
)
|
|
583
583
|
return self._con.sql(f"SELECT *, {expr} FROM {_sub(data)}")
|
|
584
584
|
|
|
585
|
+
def math_diff_lag(
|
|
586
|
+
self,
|
|
587
|
+
data: duckdb.DuckDBPyRelation,
|
|
588
|
+
column: str,
|
|
589
|
+
order_by: list[str],
|
|
590
|
+
new_column: str,
|
|
591
|
+
group_by: list[str] | None,
|
|
592
|
+
lag: int,
|
|
593
|
+
) -> duckdb.DuckDBPyRelation:
|
|
594
|
+
partition = ""
|
|
595
|
+
if group_by:
|
|
596
|
+
partition = "PARTITION BY " + ", ".join(_q(g) for g in group_by)
|
|
597
|
+
order = "ORDER BY " + ", ".join(_q(o) for o in order_by)
|
|
598
|
+
window = f"{partition} {order}".strip()
|
|
599
|
+
expr = (
|
|
600
|
+
f"({_q(column)} - LAG({_q(column)}, {lag}) OVER ({window})) "
|
|
601
|
+
f"AS {_q(new_column)}"
|
|
602
|
+
)
|
|
603
|
+
return self._con.sql(f"SELECT *, {expr} FROM {_sub(data)}")
|
|
604
|
+
|
|
585
605
|
def math_standardize(
|
|
586
606
|
self,
|
|
587
607
|
data: duckdb.DuckDBPyRelation,
|
|
@@ -366,6 +366,25 @@ class PolarsBackend(Backend):
|
|
|
366
366
|
agg_expr = agg_expr.over(group_by)
|
|
367
367
|
return data.with_columns((pl.col(column) - agg_expr).alias(new_column))
|
|
368
368
|
|
|
369
|
+
def math_diff_lag(
|
|
370
|
+
self,
|
|
371
|
+
data: pl.DataFrame,
|
|
372
|
+
column: str,
|
|
373
|
+
order_by: list[str],
|
|
374
|
+
new_column: str,
|
|
375
|
+
group_by: list[str] | None,
|
|
376
|
+
lag: int,
|
|
377
|
+
) -> pl.DataFrame:
|
|
378
|
+
if group_by:
|
|
379
|
+
expr = pl.col(column) - pl.col(column).shift(lag).over(
|
|
380
|
+
partition_by=group_by, order_by=order_by
|
|
381
|
+
)
|
|
382
|
+
return data.with_columns(expr.alias(new_column))
|
|
383
|
+
data = data.sort(order_by)
|
|
384
|
+
return data.with_columns(
|
|
385
|
+
(pl.col(column) - pl.col(column).shift(lag)).alias(new_column)
|
|
386
|
+
)
|
|
387
|
+
|
|
369
388
|
def math_standardize(
|
|
370
389
|
self,
|
|
371
390
|
data: pl.DataFrame,
|
|
@@ -97,6 +97,7 @@ OPERATION_CHUNK_REGISTRY: dict[str, OperationMeta] = {
|
|
|
97
97
|
"math_diff_from_agg": OperationMeta(
|
|
98
98
|
ChunkMode.GROUP_DEPENDENT, group_param="group_by"
|
|
99
99
|
),
|
|
100
|
+
"math_diff_lag": OperationMeta(ChunkMode.GROUP_DEPENDENT, group_param="group_by"),
|
|
100
101
|
# String operations - all chunkable
|
|
101
102
|
"str_replace": OperationMeta(ChunkMode.CHUNKABLE),
|
|
102
103
|
"str_slice": OperationMeta(ChunkMode.CHUNKABLE),
|
|
@@ -327,6 +327,46 @@ class MathOps:
|
|
|
327
327
|
},
|
|
328
328
|
)
|
|
329
329
|
|
|
330
|
+
def math_diff_lag(
|
|
331
|
+
self,
|
|
332
|
+
column: str,
|
|
333
|
+
*,
|
|
334
|
+
order_by: str | list[str],
|
|
335
|
+
new_column: str,
|
|
336
|
+
group_by: str | list[str] | None = None,
|
|
337
|
+
lag: int = 1,
|
|
338
|
+
) -> Self:
|
|
339
|
+
"""Compute row-to-row difference using lag.
|
|
340
|
+
|
|
341
|
+
Calculates column - LAG(column, lag) ordered by order_by and optionally
|
|
342
|
+
partitioned by group_by. Works on numeric columns (result is float) and
|
|
343
|
+
datetime columns (result is duration).
|
|
344
|
+
|
|
345
|
+
Args:
|
|
346
|
+
column: Source column (numeric or datetime).
|
|
347
|
+
order_by: Column(s) defining row order.
|
|
348
|
+
new_column: Name for result column.
|
|
349
|
+
group_by: Column(s) to partition by. None for global ordering.
|
|
350
|
+
lag: Number of rows to look back (must be >= 1).
|
|
351
|
+
|
|
352
|
+
Returns:
|
|
353
|
+
Self for method chaining.
|
|
354
|
+
"""
|
|
355
|
+
if isinstance(order_by, str):
|
|
356
|
+
order_by = [order_by]
|
|
357
|
+
if isinstance(group_by, str):
|
|
358
|
+
group_by = [group_by]
|
|
359
|
+
return self._register(
|
|
360
|
+
"math_diff_lag",
|
|
361
|
+
{
|
|
362
|
+
"column": column,
|
|
363
|
+
"order_by": order_by,
|
|
364
|
+
"new_column": new_column,
|
|
365
|
+
"group_by": group_by,
|
|
366
|
+
"lag": lag,
|
|
367
|
+
},
|
|
368
|
+
)
|
|
369
|
+
|
|
330
370
|
# =========================================================================
|
|
331
371
|
# Scaling Operations
|
|
332
372
|
# =========================================================================
|
|
@@ -961,6 +961,51 @@ def _validate_math_diff_from_agg(
|
|
|
961
961
|
tracker.add_column(new_column, tracker.float_type)
|
|
962
962
|
|
|
963
963
|
|
|
964
|
+
def _validate_math_diff_lag(
|
|
965
|
+
tracker: SchemaTracker, params: dict[str, Any], result: ValidationResult, step: int
|
|
966
|
+
) -> None:
|
|
967
|
+
column = params["column"]
|
|
968
|
+
order_by = params["order_by"]
|
|
969
|
+
new_column = params["new_column"]
|
|
970
|
+
group_by = params.get("group_by")
|
|
971
|
+
|
|
972
|
+
if _check_column_exists(tracker, column, result, step, "math_diff_lag"):
|
|
973
|
+
dtype = tracker.get_dtype(column)
|
|
974
|
+
if not (tracker.is_numeric(dtype) or tracker.is_datetime(dtype)):
|
|
975
|
+
result.add_error(
|
|
976
|
+
step,
|
|
977
|
+
"math_diff_lag",
|
|
978
|
+
f"Column '{column}' must be numeric or datetime, "
|
|
979
|
+
f"got {tracker.type_name(dtype)}",
|
|
980
|
+
)
|
|
981
|
+
|
|
982
|
+
missing_order = [c for c in order_by if not tracker.has_column(c)]
|
|
983
|
+
if missing_order:
|
|
984
|
+
result.add_error(
|
|
985
|
+
step,
|
|
986
|
+
"math_diff_lag",
|
|
987
|
+
f"Order-by columns do not exist: {missing_order}",
|
|
988
|
+
)
|
|
989
|
+
|
|
990
|
+
if group_by:
|
|
991
|
+
missing_group = [c for c in group_by if not tracker.has_column(c)]
|
|
992
|
+
if missing_group:
|
|
993
|
+
result.add_error(
|
|
994
|
+
step,
|
|
995
|
+
"math_diff_lag",
|
|
996
|
+
f"Group-by columns do not exist: {missing_group}",
|
|
997
|
+
)
|
|
998
|
+
|
|
999
|
+
if tracker.has_column(column):
|
|
1000
|
+
dtype = tracker.get_dtype(column)
|
|
1001
|
+
out_type = (
|
|
1002
|
+
tracker.duration_type if tracker.is_datetime(dtype) else tracker.float_type
|
|
1003
|
+
)
|
|
1004
|
+
tracker.add_column(new_column, out_type)
|
|
1005
|
+
else:
|
|
1006
|
+
tracker.add_column(new_column, tracker.float_type)
|
|
1007
|
+
|
|
1008
|
+
|
|
964
1009
|
def _validate_math_percent_of(
|
|
965
1010
|
tracker: SchemaTracker, params: dict[str, Any], result: ValidationResult, step: int
|
|
966
1011
|
) -> None:
|
|
@@ -1590,6 +1635,7 @@ _VALIDATORS: dict[str, ValidatorFunc] = {
|
|
|
1590
1635
|
"math_cumsum": _validate_math_cumsum,
|
|
1591
1636
|
"math_rank": _validate_math_rank,
|
|
1592
1637
|
"math_diff_from_agg": _validate_math_diff_from_agg,
|
|
1638
|
+
"math_diff_lag": _validate_math_diff_lag,
|
|
1593
1639
|
"math_percent_of": _validate_math_percent_of,
|
|
1594
1640
|
# Scaling ops
|
|
1595
1641
|
"math_standardize": partial(_validate_math_scaling, op_name="math_standardize"),
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: transformplan
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Safe, reproducible data transformations with built-in auditing and validation
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Project-URL: Homepage, https://github.com/limebit/transformplan
|
|
@@ -52,7 +52,7 @@ Dynamic: license-file
|
|
|
52
52
|
```python
|
|
53
53
|
from transformplan import TransformPlan, Col
|
|
54
54
|
|
|
55
|
-
# Build readable pipelines with
|
|
55
|
+
# Build readable pipelines with 89 chainable operations
|
|
56
56
|
plan = (
|
|
57
57
|
TransformPlan()
|
|
58
58
|
# Standardize column names
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|