transformplan 0.1.3__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {transformplan-0.1.3/transformplan.egg-info → transformplan-0.2.0}/PKG-INFO +2 -2
  2. {transformplan-0.1.3 → transformplan-0.2.0}/README.md +1 -1
  3. {transformplan-0.1.3 → transformplan-0.2.0}/pyproject.toml +1 -1
  4. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_duckdb.py +125 -0
  5. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_math.py +164 -0
  6. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/backends/base.py +11 -0
  7. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/backends/duckdb.py +20 -0
  8. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/backends/polars.py +19 -0
  9. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/chunking.py +1 -0
  10. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/math.py +40 -0
  11. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/validation.py +46 -0
  12. {transformplan-0.1.3 → transformplan-0.2.0/transformplan.egg-info}/PKG-INFO +2 -2
  13. {transformplan-0.1.3 → transformplan-0.2.0}/LICENSE +0 -0
  14. {transformplan-0.1.3 → transformplan-0.2.0}/setup.cfg +0 -0
  15. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_chunking.py +0 -0
  16. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_column.py +0 -0
  17. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_core.py +0 -0
  18. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_datetime.py +0 -0
  19. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_filters.py +0 -0
  20. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_integration.py +0 -0
  21. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_join.py +0 -0
  22. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_map.py +0 -0
  23. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_map_encoding.py +0 -0
  24. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_math_scaling.py +0 -0
  25. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_protocol.py +0 -0
  26. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_rows.py +0 -0
  27. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_string.py +0 -0
  28. {transformplan-0.1.3 → transformplan-0.2.0}/tests/test_validation.py +0 -0
  29. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/__init__.py +0 -0
  30. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/backends/__init__.py +0 -0
  31. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/core.py +0 -0
  32. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/filters.py +0 -0
  33. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/__init__.py +0 -0
  34. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/column.py +0 -0
  35. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/datetime.py +0 -0
  36. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/join.py +0 -0
  37. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/map.py +0 -0
  38. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/rows.py +0 -0
  39. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/ops/string.py +0 -0
  40. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/plan.py +0 -0
  41. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/protocol.py +0 -0
  42. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/py.typed +0 -0
  43. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan/sql_utils.py +0 -0
  44. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan.egg-info/SOURCES.txt +0 -0
  45. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan.egg-info/dependency_links.txt +0 -0
  46. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan.egg-info/requires.txt +0 -0
  47. {transformplan-0.1.3 → transformplan-0.2.0}/transformplan.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: transformplan
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: Safe, reproducible data transformations with built-in auditing and validation
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://github.com/limebit/transformplan
@@ -52,7 +52,7 @@ Dynamic: license-file
52
52
  ```python
53
53
  from transformplan import TransformPlan, Col
54
54
 
55
- # Build readable pipelines with 88 chainable operations
55
+ # Build readable pipelines with 89 chainable operations
56
56
  plan = (
57
57
  TransformPlan()
58
58
  # Standardize column names
@@ -22,7 +22,7 @@
22
22
  ```python
23
23
  from transformplan import TransformPlan, Col
24
24
 
25
- # Build readable pipelines with 88 chainable operations
25
+ # Build readable pipelines with 89 chainable operations
26
26
  plan = (
27
27
  TransformPlan()
28
28
  # Standardize column names
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "transformplan"
3
- version = "0.1.3"
3
+ version = "0.2.0"
4
4
  description = "Safe, reproducible data transformations with built-in auditing and validation"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -1617,6 +1617,131 @@ class TestMathDiffFromAgg:
1617
1617
  )
1618
1618
 
1619
1619
 
1620
+ class TestMathDiffLag:
1621
+ """Tests for math_diff_lag with DuckDB backend."""
1622
+
1623
+ def test_numeric_basic(
1624
+ self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
1625
+ ) -> None:
1626
+ rel = con.sql(
1627
+ "SELECT * FROM (VALUES (1, 10), (2, 30), (3, 35), (4, 50)) AS t(id, val)"
1628
+ )
1629
+ result, _ = (
1630
+ TransformPlan()
1631
+ .math_diff_lag("val", order_by="id", new_column="diff")
1632
+ .process(rel, backend=backend)
1633
+ )
1634
+ vals = _col_values(result, "diff")
1635
+ assert vals[0] is None
1636
+ assert vals[1:] == [20, 5, 15]
1637
+
1638
+ def test_numeric_lag2(
1639
+ self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
1640
+ ) -> None:
1641
+ rel = con.sql(
1642
+ "SELECT * FROM (VALUES (1, 10), (2, 30), (3, 35), (4, 50)) AS t(id, val)"
1643
+ )
1644
+ result, _ = (
1645
+ TransformPlan()
1646
+ .math_diff_lag("val", order_by="id", new_column="diff", lag=2)
1647
+ .process(rel, backend=backend)
1648
+ )
1649
+ vals = _col_values(result, "diff")
1650
+ assert vals[0] is None
1651
+ assert vals[1] is None
1652
+ assert vals[2:] == [25, 20]
1653
+
1654
+ def test_grouped_numeric(
1655
+ self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
1656
+ ) -> None:
1657
+ rel = con.sql(
1658
+ "SELECT * FROM (VALUES "
1659
+ "('A', 1, 10), ('A', 2, 30), ('A', 3, 35), "
1660
+ "('B', 1, 100), ('B', 2, 150), ('B', 3, 160)"
1661
+ ") AS t(grp, seq, val)"
1662
+ )
1663
+ result, _ = (
1664
+ TransformPlan()
1665
+ .math_diff_lag("val", order_by="seq", new_column="diff", group_by="grp")
1666
+ .rows_sort(["grp", "seq"])
1667
+ .process(rel, backend=backend)
1668
+ )
1669
+ vals = _col_values(result, "diff")
1670
+ assert vals == [None, 20, 5, None, 50, 10]
1671
+
1672
+ def test_datetime_column(
1673
+ self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
1674
+ ) -> None:
1675
+ rel = con.sql(
1676
+ "SELECT * FROM (VALUES "
1677
+ "(1, TIMESTAMP '2024-01-01 00:00:00'), "
1678
+ "(2, TIMESTAMP '2024-01-01 01:00:00'), "
1679
+ "(3, TIMESTAMP '2024-01-01 03:00:00')"
1680
+ ") AS t(id, ts)"
1681
+ )
1682
+ result, _ = (
1683
+ TransformPlan()
1684
+ .math_diff_lag("ts", order_by="id", new_column="gap")
1685
+ .process(rel, backend=backend)
1686
+ )
1687
+ vals = _col_values(result, "gap")
1688
+ assert vals[0] is None
1689
+ assert vals[1].total_seconds() == 3600
1690
+ assert vals[2].total_seconds() == 7200
1691
+
1692
+ def test_datetime_grouped(
1693
+ self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
1694
+ ) -> None:
1695
+ rel = con.sql(
1696
+ "SELECT * FROM (VALUES "
1697
+ "('A', TIMESTAMP '2024-01-01 00:00:00'), "
1698
+ "('A', TIMESTAMP '2024-01-01 02:00:00'), "
1699
+ "('B', TIMESTAMP '2024-01-01 10:00:00'), "
1700
+ "('B', TIMESTAMP '2024-01-01 13:00:00')"
1701
+ ") AS t(patient, ts)"
1702
+ )
1703
+ result, _ = (
1704
+ TransformPlan()
1705
+ .math_diff_lag("ts", order_by="ts", new_column="gap", group_by="patient")
1706
+ .rows_sort(["patient", "ts"])
1707
+ .process(rel, backend=backend)
1708
+ )
1709
+ vals = _col_values(result, "gap")
1710
+ assert vals[0] is None
1711
+ assert vals[1].total_seconds() / 3600 == 2.0
1712
+ assert vals[2] is None
1713
+ assert vals[3].total_seconds() / 3600 == 3.0
1714
+
1715
+ def test_order_by_list(
1716
+ self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
1717
+ ) -> None:
1718
+ rel = con.sql(
1719
+ "SELECT * FROM (VALUES "
1720
+ "(1, 1, 10), (1, 2, 20), (2, 1, 30), (2, 2, 40)"
1721
+ ") AS t(a, b, val)"
1722
+ )
1723
+ result, _ = (
1724
+ TransformPlan()
1725
+ .math_diff_lag("val", order_by=["a", "b"], new_column="diff")
1726
+ .process(rel, backend=backend)
1727
+ )
1728
+ vals = _col_values(result, "diff")
1729
+ assert vals == [None, 10, 10, 10]
1730
+
1731
+ def test_global_no_group(
1732
+ self, backend: DuckDBBackend, con: duckdb.DuckDBPyConnection
1733
+ ) -> None:
1734
+ rel = con.sql("SELECT * FROM (VALUES (3, 30), (1, 10), (2, 20)) AS t(seq, val)")
1735
+ result, _ = (
1736
+ TransformPlan()
1737
+ .math_diff_lag("val", order_by="seq", new_column="diff")
1738
+ .process(rel, backend=backend)
1739
+ )
1740
+ vals = _col_values(result, "diff")
1741
+ assert vals[0] is None
1742
+ assert vals[1:] == [10, 10]
1743
+
1744
+
1620
1745
  class TestColExpr:
1621
1746
  """Tests for col_expr on DuckDB backend."""
1622
1747
 
@@ -465,3 +465,167 @@ class TestMathDiffFromAgg:
465
465
  result1, _ = plan.process(numeric_df)
466
466
  result2, _ = restored.process(numeric_df)
467
467
  assert result1["diff"].to_list() == result2["diff"].to_list()
468
+
469
+
470
+ class TestMathDiffLag:
471
+ """Tests for math_diff_lag operation."""
472
+
473
+ def test_numeric_basic(self) -> None:
474
+ """Test lag=1 on integers ordered by id; first row null."""
475
+ df = pl.DataFrame({"id": [1, 2, 3, 4], "val": [10, 30, 35, 50]})
476
+ plan = TransformPlan().math_diff_lag("val", order_by="id", new_column="diff")
477
+ result, _ = plan.process(df)
478
+ assert result["diff"].to_list() == [None, 20.0, 5.0, 15.0]
479
+
480
+ def test_numeric_lag2(self) -> None:
481
+ """Test lag=2; first two rows null."""
482
+ df = pl.DataFrame({"id": [1, 2, 3, 4], "val": [10, 30, 35, 50]})
483
+ plan = TransformPlan().math_diff_lag(
484
+ "val", order_by="id", new_column="diff", lag=2
485
+ )
486
+ result, _ = plan.process(df)
487
+ assert result["diff"].to_list() == [None, None, 25.0, 20.0]
488
+
489
+ def test_grouped_numeric(self) -> None:
490
+ """Test partition by group; nulls restart per group."""
491
+ df = pl.DataFrame(
492
+ {
493
+ "grp": ["A", "A", "A", "B", "B", "B"],
494
+ "seq": [1, 2, 3, 1, 2, 3],
495
+ "val": [10, 30, 35, 100, 150, 160],
496
+ }
497
+ )
498
+ plan = TransformPlan().math_diff_lag(
499
+ "val", order_by="seq", new_column="diff", group_by="grp"
500
+ )
501
+ result, _ = plan.process(df)
502
+ expected = [None, 20.0, 5.0, None, 50.0, 10.0]
503
+ assert result["diff"].to_list() == expected
504
+
505
+ def test_datetime_column(self) -> None:
506
+ """Test datetime input produces duration output."""
507
+ df = pl.DataFrame(
508
+ {
509
+ "id": [1, 2, 3],
510
+ "ts": [
511
+ datetime(2024, 1, 1, 0, 0),
512
+ datetime(2024, 1, 1, 1, 0),
513
+ datetime(2024, 1, 1, 3, 0),
514
+ ],
515
+ }
516
+ )
517
+ plan = TransformPlan().math_diff_lag("ts", order_by="id", new_column="gap")
518
+ result, _ = plan.process(df)
519
+ assert result["gap"].dtype == pl.Duration
520
+ vals = result["gap"].to_list()
521
+ assert vals[0] is None
522
+ assert vals[1].total_seconds() == 3600
523
+ assert vals[2].total_seconds() == 7200
524
+
525
+ def test_datetime_grouped(self) -> None:
526
+ """Test primary use case: time between events per patient."""
527
+ df = pl.DataFrame(
528
+ {
529
+ "patient": ["A", "A", "B", "B"],
530
+ "ts": [
531
+ datetime(2024, 1, 1, 0, 0),
532
+ datetime(2024, 1, 1, 2, 0),
533
+ datetime(2024, 1, 1, 10, 0),
534
+ datetime(2024, 1, 1, 13, 0),
535
+ ],
536
+ }
537
+ )
538
+ plan = TransformPlan().math_diff_lag(
539
+ "ts", order_by="ts", new_column="gap", group_by="patient"
540
+ )
541
+ result, _ = plan.process(df)
542
+ assert result["gap"].dtype == pl.Duration
543
+ vals = result["gap"].to_list()
544
+ assert vals[0] is None
545
+ assert vals[1].total_seconds() / 3600 == 2.0
546
+ assert vals[2] is None
547
+ assert vals[3].total_seconds() / 3600 == 3.0
548
+
549
+ def test_order_by_different_column(self) -> None:
550
+ """Test diffing 'value' ordered by 'timestamp'."""
551
+ df = pl.DataFrame(
552
+ {
553
+ "ts": [
554
+ datetime(2024, 1, 1),
555
+ datetime(2024, 1, 2),
556
+ datetime(2024, 1, 3),
557
+ ],
558
+ "val": [100, 130, 125],
559
+ }
560
+ )
561
+ plan = TransformPlan().math_diff_lag("val", order_by="ts", new_column="change")
562
+ result, _ = plan.process(df)
563
+ assert result["change"].to_list() == [None, 30.0, -5.0]
564
+
565
+ def test_order_by_list(self) -> None:
566
+ """Test multi-column order_by."""
567
+ df = pl.DataFrame(
568
+ {
569
+ "a": [1, 1, 2, 2],
570
+ "b": [1, 2, 1, 2],
571
+ "val": [10, 20, 30, 40],
572
+ }
573
+ )
574
+ plan = TransformPlan().math_diff_lag(
575
+ "val", order_by=["a", "b"], new_column="diff"
576
+ )
577
+ result, _ = plan.process(df)
578
+ assert result["diff"].to_list() == [None, 10.0, 10.0, 10.0]
579
+
580
+ def test_global_no_group(self) -> None:
581
+ """Test no group_by, global ordering."""
582
+ df = pl.DataFrame({"seq": [3, 1, 2], "val": [30, 10, 20]})
583
+ plan = TransformPlan().math_diff_lag("val", order_by="seq", new_column="diff")
584
+ result, _ = plan.process(df)
585
+ # After sorting by seq: [10, 20, 30], diffs: [None, 10, 10]
586
+ assert result["diff"].to_list() == [None, 10.0, 10.0]
587
+
588
+ def test_validation_nonexistent_column(self, numeric_df: pl.DataFrame) -> None:
589
+ """Test validation catches non-existent column."""
590
+ plan = TransformPlan().math_diff_lag(
591
+ "nonexistent", order_by="a", new_column="diff"
592
+ )
593
+ result = plan.validate(numeric_df)
594
+ assert not result.is_valid
595
+ assert "does not exist" in str(result.errors[0])
596
+
597
+ def test_validation_wrong_type(self, basic_df: pl.DataFrame) -> None:
598
+ """Test validation catches string column."""
599
+ plan = TransformPlan().math_diff_lag("name", order_by="id", new_column="diff")
600
+ result = plan.validate(basic_df)
601
+ assert not result.is_valid
602
+ assert "numeric or datetime" in str(result.errors[0])
603
+
604
+ def test_validation_missing_order_by(self, numeric_df: pl.DataFrame) -> None:
605
+ """Test validation catches missing order_by column."""
606
+ plan = TransformPlan().math_diff_lag(
607
+ "a", order_by="nonexistent", new_column="diff"
608
+ )
609
+ result = plan.validate(numeric_df)
610
+ assert not result.is_valid
611
+ assert "Order-by" in str(result.errors[0])
612
+
613
+ def test_validation_missing_group_by(self, numeric_df: pl.DataFrame) -> None:
614
+ """Test validation catches missing group_by column."""
615
+ plan = TransformPlan().math_diff_lag(
616
+ "a", order_by="a", new_column="diff", group_by="nonexistent"
617
+ )
618
+ result = plan.validate(numeric_df)
619
+ assert not result.is_valid
620
+ assert "Group-by" in str(result.errors[0])
621
+
622
+ def test_serialization_roundtrip(self, numeric_df: pl.DataFrame) -> None:
623
+ """Test JSON serialization round-trip."""
624
+ plan = TransformPlan().math_diff_lag(
625
+ "a", order_by="a", new_column="diff", group_by="b", lag=2
626
+ )
627
+ json_str = plan.to_json()
628
+ restored = TransformPlan.from_json(json_str)
629
+ result1, _ = plan.process(numeric_df)
630
+ result2, _ = restored.process(numeric_df)
631
+ assert result1["diff"].to_list() == result2["diff"].to_list()
@@ -309,6 +309,17 @@ class Backend(ABC):
309
309
  group_by: list[str] | None,
310
310
  ) -> Any: ...
311
311
 
312
+ @abstractmethod
313
+ def math_diff_lag(
314
+ self,
315
+ data: Any,
316
+ column: str,
317
+ order_by: list[str],
318
+ new_column: str,
319
+ group_by: list[str] | None,
320
+ lag: int,
321
+ ) -> Any: ...
322
+
312
323
  @abstractmethod
313
324
  def math_standardize(
314
325
  self,
@@ -582,6 +582,26 @@ class DuckDBBackend(Backend):
582
582
  )
583
583
  return self._con.sql(f"SELECT *, {expr} FROM {_sub(data)}")
584
584
 
585
+ def math_diff_lag(
586
+ self,
587
+ data: duckdb.DuckDBPyRelation,
588
+ column: str,
589
+ order_by: list[str],
590
+ new_column: str,
591
+ group_by: list[str] | None,
592
+ lag: int,
593
+ ) -> duckdb.DuckDBPyRelation:
594
+ partition = ""
595
+ if group_by:
596
+ partition = "PARTITION BY " + ", ".join(_q(g) for g in group_by)
597
+ order = "ORDER BY " + ", ".join(_q(o) for o in order_by)
598
+ window = f"{partition} {order}".strip()
599
+ expr = (
600
+ f"({_q(column)} - LAG({_q(column)}, {lag}) OVER ({window})) "
601
+ f"AS {_q(new_column)}"
602
+ )
603
+ return self._con.sql(f"SELECT *, {expr} FROM {_sub(data)}")
604
+
585
605
  def math_standardize(
586
606
  self,
587
607
  data: duckdb.DuckDBPyRelation,
@@ -366,6 +366,25 @@ class PolarsBackend(Backend):
366
366
  agg_expr = agg_expr.over(group_by)
367
367
  return data.with_columns((pl.col(column) - agg_expr).alias(new_column))
368
368
 
369
+ def math_diff_lag(
370
+ self,
371
+ data: pl.DataFrame,
372
+ column: str,
373
+ order_by: list[str],
374
+ new_column: str,
375
+ group_by: list[str] | None,
376
+ lag: int,
377
+ ) -> pl.DataFrame:
378
+ if group_by:
379
+ expr = pl.col(column) - pl.col(column).shift(lag).over(
380
+ partition_by=group_by, order_by=order_by
381
+ )
382
+ return data.with_columns(expr.alias(new_column))
383
+ data = data.sort(order_by)
384
+ return data.with_columns(
385
+ (pl.col(column) - pl.col(column).shift(lag)).alias(new_column)
386
+ )
387
+
369
388
  def math_standardize(
370
389
  self,
371
390
  data: pl.DataFrame,
@@ -97,6 +97,7 @@ OPERATION_CHUNK_REGISTRY: dict[str, OperationMeta] = {
97
97
  "math_diff_from_agg": OperationMeta(
98
98
  ChunkMode.GROUP_DEPENDENT, group_param="group_by"
99
99
  ),
100
+ "math_diff_lag": OperationMeta(ChunkMode.GROUP_DEPENDENT, group_param="group_by"),
100
101
  # String operations - all chunkable
101
102
  "str_replace": OperationMeta(ChunkMode.CHUNKABLE),
102
103
  "str_slice": OperationMeta(ChunkMode.CHUNKABLE),
@@ -327,6 +327,46 @@ class MathOps:
327
327
  },
328
328
  )
329
329
 
330
+ def math_diff_lag(
331
+ self,
332
+ column: str,
333
+ *,
334
+ order_by: str | list[str],
335
+ new_column: str,
336
+ group_by: str | list[str] | None = None,
337
+ lag: int = 1,
338
+ ) -> Self:
339
+ """Compute row-to-row difference using lag.
340
+
341
+ Calculates column - LAG(column, lag) ordered by order_by and optionally
342
+ partitioned by group_by. Works on numeric columns (result is float) and
343
+ datetime columns (result is duration).
344
+
345
+ Args:
346
+ column: Source column (numeric or datetime).
347
+ order_by: Column(s) defining row order.
348
+ new_column: Name for result column.
349
+ group_by: Column(s) to partition by. None for global ordering.
350
+ lag: Number of rows to look back (must be >= 1).
351
+
352
+ Returns:
353
+ Self for method chaining.
354
+ """
355
+ if isinstance(order_by, str):
356
+ order_by = [order_by]
357
+ if isinstance(group_by, str):
358
+ group_by = [group_by]
359
+ return self._register(
360
+ "math_diff_lag",
361
+ {
362
+ "column": column,
363
+ "order_by": order_by,
364
+ "new_column": new_column,
365
+ "group_by": group_by,
366
+ "lag": lag,
367
+ },
368
+ )
369
+
330
370
  # =========================================================================
331
371
  # Scaling Operations
332
372
  # =========================================================================
@@ -961,6 +961,51 @@ def _validate_math_diff_from_agg(
961
961
  tracker.add_column(new_column, tracker.float_type)
962
962
 
963
963
 
964
+ def _validate_math_diff_lag(
965
+ tracker: SchemaTracker, params: dict[str, Any], result: ValidationResult, step: int
966
+ ) -> None:
967
+ column = params["column"]
968
+ order_by = params["order_by"]
969
+ new_column = params["new_column"]
970
+ group_by = params.get("group_by")
971
+
972
+ if _check_column_exists(tracker, column, result, step, "math_diff_lag"):
973
+ dtype = tracker.get_dtype(column)
974
+ if not (tracker.is_numeric(dtype) or tracker.is_datetime(dtype)):
975
+ result.add_error(
976
+ step,
977
+ "math_diff_lag",
978
+ f"Column '{column}' must be numeric or datetime, "
979
+ f"got {tracker.type_name(dtype)}",
980
+ )
981
+
982
+ missing_order = [c for c in order_by if not tracker.has_column(c)]
983
+ if missing_order:
984
+ result.add_error(
985
+ step,
986
+ "math_diff_lag",
987
+ f"Order-by columns do not exist: {missing_order}",
988
+ )
989
+
990
+ if group_by:
991
+ missing_group = [c for c in group_by if not tracker.has_column(c)]
992
+ if missing_group:
993
+ result.add_error(
994
+ step,
995
+ "math_diff_lag",
996
+ f"Group-by columns do not exist: {missing_group}",
997
+ )
998
+
999
+ if tracker.has_column(column):
1000
+ dtype = tracker.get_dtype(column)
1001
+ out_type = (
1002
+ tracker.duration_type if tracker.is_datetime(dtype) else tracker.float_type
1003
+ )
1004
+ tracker.add_column(new_column, out_type)
1005
+ else:
1006
+ tracker.add_column(new_column, tracker.float_type)
1007
+
1008
+
964
1009
  def _validate_math_percent_of(
965
1010
  tracker: SchemaTracker, params: dict[str, Any], result: ValidationResult, step: int
966
1011
  ) -> None:
@@ -1590,6 +1635,7 @@ _VALIDATORS: dict[str, ValidatorFunc] = {
1590
1635
  "math_cumsum": _validate_math_cumsum,
1591
1636
  "math_rank": _validate_math_rank,
1592
1637
  "math_diff_from_agg": _validate_math_diff_from_agg,
1638
+ "math_diff_lag": _validate_math_diff_lag,
1593
1639
  "math_percent_of": _validate_math_percent_of,
1594
1640
  # Scaling ops
1595
1641
  "math_standardize": partial(_validate_math_scaling, op_name="math_standardize"),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: transformplan
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: Safe, reproducible data transformations with built-in auditing and validation
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://github.com/limebit/transformplan
@@ -52,7 +52,7 @@ Dynamic: license-file
52
52
  ```python
53
53
  from transformplan import TransformPlan, Col
54
54
 
55
- # Build readable pipelines with 88 chainable operations
55
+ # Build readable pipelines with 89 chainable operations
56
56
  plan = (
57
57
  TransformPlan()
58
58
  # Standardize column names
File without changes
File without changes