df-eval 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
df_eval/__init__.py ADDED
@@ -0,0 +1,53 @@
1
"""
df-eval: A lightweight expression evaluation engine for pandas DataFrames.

This package provides tools for evaluating expressions on pandas DataFrames,
supporting schema-driven derived columns and external lookups.
"""

# Must match the version of the distribution this module ships in
# (the df-eval 0.1.1 wheel); the previous value "0.1.0" was stale.
__version__ = "0.1.1"

# Public API re-exports: importing from ``df_eval`` directly is the supported
# entry point; the submodule layout is an implementation detail.
from df_eval.engine import Engine, CycleDetectedError
from df_eval.expr import Expression
from df_eval.lookup import (
    lookup,
    Resolver,
    CachedResolver,
    DictResolver,
    FileResolver,
    DatabaseResolver,
    HTTPResolver,
)
from df_eval.pandera import (
    df_eval_schema_from_pandera,
    apply_pandera_schema,
    apply_pandera_schema_parquet_to_parquet,
    load_pandera_schema_yaml,
    dump_pandera_schema_yaml,
    load_pandera_schema_json,
    dump_pandera_schema_json,
)
from df_eval.parquet import iter_parquet_row_chunks, write_parquet_row_chunks

# Names exported by ``from df_eval import *``; mirrors the imports above.
__all__ = [
    "Engine",
    "Expression",
    "CycleDetectedError",
    "lookup",
    "Resolver",
    "CachedResolver",
    "DictResolver",
    "FileResolver",
    "DatabaseResolver",
    "HTTPResolver",
    "df_eval_schema_from_pandera",
    "apply_pandera_schema",
    "apply_pandera_schema_parquet_to_parquet",
    "load_pandera_schema_yaml",
    "dump_pandera_schema_yaml",
    "load_pandera_schema_json",
    "dump_pandera_schema_json",
    "iter_parquet_row_chunks",
    "write_parquet_row_chunks",
    "__version__",
]
df_eval/engine.py ADDED
@@ -0,0 +1,611 @@
1
+ """
2
+ Evaluation engine module.
3
+
4
+ This module provides the Engine class for evaluating expressions
5
+ on pandas DataFrames with support for UDF registry, schema-driven
6
+ derived columns with topological ordering, and provenance tracking.
7
+ """
8
+
9
+ from collections.abc import Iterator, Sequence
10
+ from pathlib import Path
11
+
12
+ import pandas as pd
13
+ from typing import Any, Callable, Dict, List, Optional, Set
14
+
15
+ from df_eval.expr import Expression
16
+ from df_eval.functions import BUILTIN_FUNCTIONS
17
+ from df_eval.parquet import iter_parquet_row_chunks, write_parquet_row_chunks
18
+ from df_eval.lookup import Resolver, lookup as _lookup
19
+
20
+
21
class CycleDetectedError(Exception):
    """Signal that derived-column dependencies form a cycle.

    Raised by the engine's topological sort when some columns can never be
    scheduled because they (directly or transitively) depend on themselves.
    """
24
+
25
+
26
class Engine:
    """
    Engine for evaluating expressions on pandas DataFrames.

    The Engine class provides methods to evaluate expressions,
    apply transformations, and manage UDF/constant registries.

    Instance state (all created in ``__init__``):

    * ``functions`` -- name -> callable, seeded from a copy of
      ``BUILTIN_FUNCTIONS`` and exposed to expressions.
    * ``constants`` -- name -> value, exposed to expressions.
    * ``resolvers`` -- name -> lookup resolver used by lookup operations.
    * ``pipeline_functions`` -- name -> callable used by function operations.
    * ``_track_provenance`` -- whether ``apply_schema`` records provenance.
    """

    def __init__(self) -> None:
        """Initialize the evaluation engine with empty registries."""
        # Copy so that per-engine registrations do not modify the shared
        # builtin table.
        self.functions = BUILTIN_FUNCTIONS.copy()
        self.constants: Dict[str, Any] = {}
        # Registry of external lookup resolvers (e.g., DictResolver instances)
        # that can be referenced by name from expressions via the ``lookup``
        # helper function.
        self.resolvers: Dict[str, Resolver] = {}
        # Registry of metadata-driven pipeline functions. These functions are
        # invoked by higher-level orchestration (e.g., Pandera integration)
        # rather than directly from pandas.eval expressions.
        self.pipeline_functions: Dict[str, Callable[..., Any]] = {}
        self._track_provenance = False

    def enable_provenance(self, enabled: bool = True) -> None:
        """
        Enable or disable provenance tracking.

        Args:
            enabled: Whether to track provenance in df.attrs.
        """
        self._track_provenance = enabled

    def register_function(self, name: str, func: Callable[..., Any]) -> None:
        """
        Register a custom function (UDF) for use in expressions.

        An existing entry with the same name is replaced.

        Args:
            name: The name to register the function under.
            func: The function to register.
        """
        self.functions[name] = func

    def register_constant(self, name: str, value: Any) -> None:
        """
        Register a constant for use in expressions.

        An existing entry with the same name is replaced.

        Args:
            name: The name to register the constant under.
            value: The constant value.
        """
        self.constants[name] = value

    def register_resolver(self, name: str, resolver: Resolver) -> None:
        """Register a lookup resolver for use in expressions.

        Registered resolvers can be referenced by name from expressions via
        the :func:`lookup` helper, for example::

            engine.register_resolver("prices", price_resolver)
            schema = {"price": "lookup(product, prices)"}

        Args:
            name: Name to register the resolver under.
            resolver: Resolver instance (e.g., :class:`DictResolver`).
        """
        self.resolvers[name] = resolver

    def register_pipeline_function(self, name: str, func: Callable[..., Any]) -> None:
        """Register a named pipeline function for metadata-driven workflows.

        Pipeline functions are invoked by higher-level orchestration layers
        (for example, Pandera-driven schemas) based on column metadata rather
        than being called directly from df-eval expression strings. A pipeline
        function typically accepts a ``pandas.DataFrame`` slice and optional
        keyword arguments, and returns either a ``Series`` or ``DataFrame``
        aligned with the input index.

        Args:
            name: Name to register the function under.
            func: The callable to register.
        """
        self.pipeline_functions[name] = func

    def evaluate(
        self,
        df: pd.DataFrame,
        expr: str | Expression,
        dtype: Optional[str] = None
    ) -> Any:
        """
        Evaluate an expression on a DataFrame.

        Args:
            df: The DataFrame to evaluate the expression on.
            expr: The expression to evaluate (string or Expression object).
            dtype: Optional dtype to cast the result to. The cast is applied
                only when the evaluation result is a ``Series``.

        Returns:
            The result of evaluating the expression.

        Raises:
            ValueError: If evaluation (or the dtype cast) fails; the original
                exception is attached as ``__cause__``.
        """
        if isinstance(expr, str):
            expr = Expression(expr)

        # Use pandas eval for expressions
        try:
            # Pass constants and functions as resolvers so they behave like
            # variables in the expression namespace.
            result = df.eval(
                expr.expr_str,
                resolvers=[self.constants, self.functions],
            )

            # Apply dtype cast if specified
            if dtype is not None and isinstance(result, pd.Series):
                result = result.astype(dtype)

            return result
        except Exception as e:
            raise ValueError(f"Failed to evaluate expression '{expr.expr_str}': {e}") from e

    def evaluate_many(
        self,
        df: pd.DataFrame,
        expressions: Dict[str, str | Expression]
    ) -> pd.DataFrame:
        """
        Evaluate multiple expressions and add them as columns.

        This is an alias for apply_schema for batch evaluation.

        Args:
            df: The input DataFrame.
            expressions: A dictionary mapping column names to expressions.

        Returns:
            A new DataFrame with the evaluated columns added.
        """
        return self.apply_schema(df, expressions)

    def apply_schema(
        self,
        df: pd.DataFrame,
        schema: Dict[str, str | Expression],
        dtypes: Optional[Dict[str, str]] = None
    ) -> pd.DataFrame:
        """
        Apply a schema of derived columns to a DataFrame with topological ordering.

        This method automatically handles dependencies between columns and
        detects cycles in the dependency graph. The input frame is copied and
        left unmodified.

        When provenance tracking is enabled, ``attrs['df_eval_provenance']``
        of the returned frame maps each derived column to its expression
        string and list of dependencies.

        Args:
            df: The input DataFrame.
            schema: A dictionary mapping column names to expressions.
            dtypes: Optional dictionary mapping column names to dtypes.

        Returns:
            A new DataFrame with the derived columns added.

        Raises:
            CycleDetectedError: If a cycle is detected in dependencies.
            ValueError: If any expression fails to evaluate.
        """
        result = df.copy()
        dtypes = dtypes or {}

        if self._track_provenance:
            # Work on a private copy of the provenance mapping: depending on
            # how attrs were propagated by ``copy()``, the nested dict may
            # still be the very object held by the caller's frame, and
            # writing into it would silently mutate the input.
            result.attrs['df_eval_provenance'] = dict(
                result.attrs.get('df_eval_provenance', {})
            )

        # Convert all to Expression objects so dependencies can be inspected
        expr_objects: Dict[str, Expression] = {
            col_name: Expression(expr) if isinstance(expr, str) else expr
            for col_name, expr in schema.items()
        }

        # Perform topological sort
        ordered_cols = self._topological_sort(expr_objects, set(result.columns))

        # Evaluate in dependency order; each new column is visible to the
        # expressions evaluated after it.
        for col_name in ordered_cols:
            expr_obj = expr_objects[col_name]
            result[col_name] = self.evaluate(
                result, expr_obj, dtype=dtypes.get(col_name)
            )

            if self._track_provenance:
                result.attrs['df_eval_provenance'][col_name] = {
                    'expression': expr_obj.expr_str,
                    'dependencies': list(expr_obj.dependencies)
                }

        return result

    def apply_operations(
        self,
        df: pd.DataFrame,
        operations: Dict[str, Dict[str, Any]],
        dtypes: Optional[Dict[str, str]] = None,
    ) -> pd.DataFrame:
        """Apply a set of operations (expr, lookup, function) to a DataFrame.

        ``operations`` is a mapping from column name to a spec with keys::

            {
                "kind": "expr" | "lookup" | "function",
                "expr": str | None,
                "lookup": dict | None,
                "function": dict | None,
            }

        This is intended to be used by higher-level integrations such as the
        Pandera helpers, which translate column metadata into this structure.

        Execution order is: all lookup operations (in mapping order), then
        expr operations (in dependency order), then function operations (in
        mapping order). Operations with any other ``kind`` are ignored.

        Args:
            df: The input DataFrame; it is copied, not modified.
            operations: Mapping of output column name to operation spec.
            dtypes: Optional mapping of column name to dtype, applied to
                expr operations only.

        Returns:
            A new DataFrame with the operation outputs added.

        Raises:
            CycleDetectedError: If expr operations depend on each other
                cyclically.
        """
        result = df.copy()
        dtypes = dtypes or {}

        # Build Expression objects only for expr-kind operations so we can
        # reuse the existing dependency analysis and topological sort.
        expr_objects: Dict[str, Expression] = {
            col_name: Expression(op.get("expr"))
            for col_name, op in operations.items()
            if op.get("kind") == "expr"
        }
        ordered_expr_cols = self._topological_sort(
            expr_objects, set(result.columns)
        )

        # 1) Lookups first so that any resolved columns (e.g. "price") are
        #    available to subsequent expressions and functions regardless of
        #    dictionary ordering.
        lookup_ops = [
            name for name, spec in operations.items()
            if spec.get("kind") == "lookup"
        ]
        # 3) Functions last: they can depend on both expr outputs and
        #    lookup-generated columns.
        function_ops = [
            name for name, spec in operations.items()
            if spec.get("kind") == "function"
        ]
        # 2) Expr operations sit in between, in dependency order.
        ordered_ops: list[str] = [*lookup_ops, *ordered_expr_cols, *function_ops]

        # Apply operations in the computed order. Lookups and functions
        # materialize their columns on ``result``, which may then be consumed
        # by later expr operations or pipeline functions.
        for col_name in ordered_ops:
            op = operations[col_name]
            kind = op.get("kind")

            if kind == "lookup":
                lookup_spec = op.get("lookup") or {}
                result[col_name] = self._apply_lookup_operation(result, lookup_spec)

            elif kind == "function":
                func_spec = op.get("function") or {}
                # Output column names come from the spec's "outputs", not
                # from ``col_name``.
                result = self._apply_pipeline_function(result, func_spec)

            elif kind == "expr":
                result[col_name] = self.evaluate(
                    result, expr_objects[col_name], dtype=dtypes.get(col_name)
                )

        return result

    # ------------------------------------------------------------------
    # Metadata-driven pipeline helpers
    # ------------------------------------------------------------------

    def _apply_pipeline_function(self, df: pd.DataFrame, spec: Dict[str, Any]) -> pd.DataFrame:
        """Apply a registered pipeline function according to a metadata spec.

        The spec supports the following keys::

            {
                "name": "churn_model_v1",
                "inputs": ["age", "tenure"],   # optional; defaults to all columns
                "outputs": ["churn_score"],    # optional for DataFrame results
                "params": {"region": "eu-west-1"},  # optional kwargs
            }

        The registered function is expected to accept a DataFrame (projected
        to the specified input columns) and keyword arguments, and return
        either a Series or DataFrame whose index aligns with ``df``.

        Args:
            df: Frame the function is applied to.
            spec: Metadata describing which function to call and how.

        Returns:
            A new DataFrame: ``df`` with the Series assigned as the single
            output column, or ``df`` joined with the returned DataFrame
            (renamed to ``outputs`` when given).

        Raises:
            KeyError: If ``spec`` has no ``"name"`` entry.
            ValueError: If the function is not registered, or the number of
                declared outputs does not match the returned shape.
            TypeError: If the function returns neither a Series nor a
                DataFrame.
        """
        name = spec["name"]
        if name not in self.pipeline_functions:
            raise ValueError(f"Unknown pipeline function '{name}' in metadata")

        func = self.pipeline_functions[name]
        inputs = spec.get("inputs")
        outputs = spec.get("outputs")
        # ``or {}`` (rather than a .get default) also covers an explicit
        # ``"params": None`` in the metadata, which would otherwise fail on
        # ``**None``.
        params = spec.get("params") or {}

        input_df = df if inputs is None else df[inputs]
        result = func(input_df, **params)

        if isinstance(result, pd.Series):
            if not outputs or len(outputs) != 1:
                raise ValueError(
                    f"Pipeline function '{name}' returned a Series but "
                    "metadata did not specify exactly one output column"
                )
            col_name = outputs[0]
            return df.assign(**{col_name: result})

        if isinstance(result, pd.DataFrame):
            if outputs is not None:
                if len(outputs) != result.shape[1]:
                    raise ValueError(
                        f"Pipeline function '{name}' returned {result.shape[1]} "
                        f"columns but metadata specifies {len(outputs)} outputs"
                    )
                result = result.set_axis(outputs, axis=1)
            return df.join(result)

        raise TypeError(
            f"Pipeline function '{name}' must return a Series or DataFrame, "
            f"got {type(result)!r}"
        )

    def _apply_lookup_operation(self, df: pd.DataFrame, spec: Dict[str, Any]) -> pd.Series:
        """Apply a lookup operation described by metadata.

        The spec supports the following keys::

            {
                "resolver": "prices",          # name of registered resolver (preferred)
                # or
                "mapping": {"a": 1, "b": 2},  # inline mapping for small cases
                "key": "product",              # column providing lookup keys
                "on_missing": "null",          # "null" | "keep" | "raise"
            }

        ``"resolver"`` takes precedence when both it and ``"mapping"`` are
        present; ``"on_missing"`` defaults to ``"null"``.

        Args:
            df: Frame providing the key column.
            spec: Lookup metadata as described above.

        Returns:
            The result of the ``lookup`` helper applied to ``df[spec["key"]]``.

        Raises:
            KeyError: If ``spec`` has no ``"key"`` entry.
            ValueError: If the named resolver is not registered, or neither
                ``"resolver"`` nor ``"mapping"`` is given.
        """
        key_col = spec["key"]
        on_missing = spec.get("on_missing", "null")

        if "resolver" in spec:
            resolver_name = spec["resolver"]
            try:
                resolver = self.resolvers[resolver_name]
            except KeyError as exc:
                raise ValueError(
                    f"Unknown resolver '{resolver_name}' in lookup metadata"
                ) from exc
        elif "mapping" in spec:
            from df_eval.lookup import DictResolver

            resolver = DictResolver(spec["mapping"])
        else:
            raise ValueError("lookup metadata requires either 'resolver' or 'mapping'")

        return _lookup(df[key_col], resolver, on_missing=on_missing)

    def apply_pandera_schema(
        self,
        df: pd.DataFrame,
        schema: Any,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Apply a Pandera schema and derive df-eval columns from metadata.

        This is a thin convenience wrapper around
        ``df_eval.pandera.apply_pandera_schema`` that forwards the current
        engine instance so registered functions/constants and provenance
        settings are honored.

        Args:
            df: The input DataFrame.
            schema: Schema object forwarded unchanged to the helper.
            **kwargs: Extra keyword arguments forwarded to the helper.

        Returns:
            Whatever ``df_eval.pandera.apply_pandera_schema`` returns.
        """
        # Imported lazily, at call time rather than at module import.
        from df_eval.pandera import apply_pandera_schema

        return apply_pandera_schema(df, schema, engine=self, **kwargs)

    def iter_apply_schema_parquet_chunks(
        self,
        input_path: str | Path,
        schema: Dict[str, str | Expression],
        *,
        dtypes: Optional[Dict[str, str]] = None,
        chunk_size: int = 100_000,
        input_columns: Sequence[str] | None = None,
        output_columns: Sequence[str] | None = None,
    ) -> Iterator[pd.DataFrame]:
        """Yield transformed chunks from a Parquet file or dataset.

        Each chunk produced by ``iter_parquet_row_chunks`` is passed through
        :meth:`apply_schema` independently.

        Args:
            input_path: Source Parquet file or directory-backed dataset.
            schema: Mapping of derived column names to expressions.
            dtypes: Optional mapping of derived column names to pandas dtypes.
            chunk_size: Maximum rows to scan and transform per chunk.
            input_columns: Optional input column projection for scan efficiency.
            output_columns: Optional ordered subset of output columns to keep.

        Yields:
            Transformed DataFrame chunks.
        """
        # Materialize once so any Sequence type works as a ``.loc`` selector.
        selected_output_columns = list(output_columns) if output_columns is not None else None

        for chunk in iter_parquet_row_chunks(
            input_path,
            chunk_size=chunk_size,
            columns=input_columns,
        ):
            transformed = self.apply_schema(chunk, schema, dtypes=dtypes)
            if selected_output_columns is not None:
                transformed = transformed.loc[:, selected_output_columns]
            yield transformed

    def apply_schema_parquet_to_df(
        self,
        input_path: str | Path,
        schema: Dict[str, str | Expression],
        *,
        dtypes: Optional[Dict[str, str]] = None,
        chunk_size: int = 100_000,
        input_columns: Sequence[str] | None = None,
        output_columns: Sequence[str] | None = None,
    ) -> pd.DataFrame:
        """Transform a Parquet dataset chunk-by-chunk and return one DataFrame.

        All transformed chunks are held in memory and concatenated with a
        fresh integer index.

        Args:
            input_path: Source Parquet file or directory-backed dataset.
            schema: Mapping of derived column names to expressions.
            dtypes: Optional mapping of derived column names to pandas dtypes.
            chunk_size: Maximum rows to process per chunk.
            input_columns: Optional input column projection for scan efficiency.
            output_columns: Optional ordered subset of output columns to keep.

        Returns:
            A DataFrame containing all transformed rows. Returns an empty
            DataFrame when the input yields no row chunks.
        """
        chunks = list(
            self.iter_apply_schema_parquet_chunks(
                input_path,
                schema,
                dtypes=dtypes,
                chunk_size=chunk_size,
                input_columns=input_columns,
                output_columns=output_columns,
            )
        )
        # ``pd.concat`` rejects an empty list, so handle that case explicitly.
        if not chunks:
            return pd.DataFrame()
        return pd.concat(chunks, ignore_index=True)

    def apply_schema_parquet_to_parquet(
        self,
        input_path: str | Path,
        output_path: str | Path,
        schema: Dict[str, str | Expression],
        *,
        dtypes: Optional[Dict[str, str]] = None,
        chunk_size: int = 100_000,
        input_columns: Sequence[str] | None = None,
        output_columns: Sequence[str] | None = None,
        compression: str = "snappy",
    ) -> Path:
        """Transform a Parquet dataset chunk-by-chunk and write Parquet output.

        This method is optimized for out-of-memory processing: source data is
        streamed in row chunks, transformed with the same expression engine
        used for in-memory DataFrames, and written incrementally to ``output_path``.

        Args:
            input_path: Source Parquet file or directory-backed dataset.
            output_path: Destination Parquet file.
            schema: Mapping of derived column names to expressions.
            dtypes: Optional mapping of derived column names to pandas dtypes.
            chunk_size: Maximum rows to process per chunk.
            input_columns: Optional input column projection for scan efficiency.
            output_columns: Optional ordered subset of output columns to keep.
            compression: Parquet compression codec used for output.

        Returns:
            The normalized ``output_path``.
        """
        # The generator is handed over unconsumed so chunks are produced as
        # the writer iterates over them.
        transformed_chunks = self.iter_apply_schema_parquet_chunks(
            input_path,
            schema,
            dtypes=dtypes,
            chunk_size=chunk_size,
            input_columns=input_columns,
            output_columns=output_columns,
        )
        return write_parquet_row_chunks(
            transformed_chunks,
            output_path,
            compression=compression,
        )

    def apply_pandera_schema_parquet_to_parquet(
        self,
        input_path: str | Path,
        output_path: str | Path,
        schema: Any,
        **kwargs: Any,
    ) -> Path:
        """Apply a Pandera schema to Parquet input and write Parquet output.

        Thin wrapper around
        ``df_eval.pandera.apply_pandera_schema_parquet_to_parquet`` that
        forwards this engine instance as ``engine``.

        Args:
            input_path: Source path forwarded to the helper.
            output_path: Destination path forwarded to the helper.
            schema: Schema object forwarded unchanged to the helper.
            **kwargs: Extra keyword arguments forwarded to the helper.

        Returns:
            Whatever the helper returns.
        """
        # Imported lazily, at call time rather than at module import.
        from df_eval.pandera import apply_pandera_schema_parquet_to_parquet

        return apply_pandera_schema_parquet_to_parquet(
            input_path,
            output_path,
            schema,
            engine=self,
            **kwargs,
        )

    def _topological_sort(
        self,
        expressions: Dict[str, Expression],
        existing_cols: Set[str]
    ) -> List[str]:
        """
        Perform topological sort on expressions based on dependencies.

        Only dependencies on other derived columns (keys of ``expressions``)
        create ordering constraints. Among columns that are ready at the same
        time, the smallest name is emitted first so the output is
        deterministic.

        Args:
            expressions: Dictionary of column names to Expression objects.
            existing_cols: Set of existing column names in the DataFrame.
                Currently not consulted; kept for interface compatibility.

        Returns:
            List of column names in dependency order.

        Raises:
            CycleDetectedError: If a cycle is detected (including a derived
                column that lists itself as a dependency).
        """
        import heapq

        # unmet[A]: number of derived columns A still waits for.
        # dependents[B]: derived columns that need B evaluated first.
        unmet: Dict[str, int] = {}
        dependents: Dict[str, List[str]] = {name: [] for name in expressions}
        for col_name, expr in expressions.items():
            # Only consider dependencies on other derived columns
            deps = expr.dependencies & expressions.keys()
            unmet[col_name] = len(deps)
            for dep in deps:
                dependents[dep].append(col_name)

        # Kahn's algorithm. A min-heap yields the smallest ready name each
        # step, matching a "sort the ready list, take the first" policy
        # without re-sorting on every iteration.
        ready = [col for col, count in unmet.items() if count == 0]
        heapq.heapify(ready)
        result: List[str] = []

        while ready:
            node = heapq.heappop(ready)
            result.append(node)

            # Release every column that was waiting on this node.
            for waiting in dependents[node]:
                unmet[waiting] -= 1
                if unmet[waiting] == 0:
                    heapq.heappush(ready, waiting)

        # Anything never released is part of (or downstream of) a cycle.
        if len(result) != len(expressions):
            remaining = set(expressions.keys()) - set(result)
            raise CycleDetectedError(
                f"Cycle detected in column dependencies: {remaining}"
            )

        return result