docworkspace 0.2.7__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {docworkspace-0.2.7 → docworkspace-0.2.9}/PKG-INFO +1 -1
  2. {docworkspace-0.2.7 → docworkspace-0.2.9}/pyproject.toml +1 -1
  3. {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/__init__.py +3 -3
  4. {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/node/__init__.py +2 -2
  5. {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/node/core.py +143 -6
  6. {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/node/io.py +4 -0
  7. {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/workspace/analysis.py +14 -1
  8. {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_fastapi_integration.py +2 -2
  9. {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_node.py +37 -2
  10. {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_node_io.py +174 -0
  11. {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_workspace.py +32 -0
  12. {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_workspace_io_absolute_paths.py +68 -0
  13. {docworkspace-0.2.7 → docworkspace-0.2.9}/uv.lock +1 -1
  14. {docworkspace-0.2.7 → docworkspace-0.2.9}/.github/workflows/ci.yml +0 -0
  15. {docworkspace-0.2.7 → docworkspace-0.2.9}/.github/workflows/release.yml +0 -0
  16. {docworkspace-0.2.7 → docworkspace-0.2.9}/.gitignore +0 -0
  17. {docworkspace-0.2.7 → docworkspace-0.2.9}/PUBLISH.md +0 -0
  18. {docworkspace-0.2.7 → docworkspace-0.2.9}/README.md +0 -0
  19. {docworkspace-0.2.7 → docworkspace-0.2.9}/pytest.ini +0 -0
  20. {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/workspace/__init__.py +0 -0
  21. {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/workspace/core.py +0 -0
  22. {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/workspace/io.py +0 -0
  23. {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/conftest.py +0 -0
  24. {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_simple_operations.py +0 -0
  25. {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_workspace_serialization_types.py +0 -0
  26. {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_workspace_shim.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docworkspace
3
- Version: 0.2.7
3
+ Version: 0.2.9
4
4
  Summary: A workspace library for managing Polars dataframes with parent-child relationships and lazy evaluation
5
5
  Requires-Python: >=3.14
6
6
  Requires-Dist: polars-text>=0.1.6
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docworkspace"
3
- version = "0.2.7"
3
+ version = "0.2.9"
4
4
  description = "A workspace library for managing Polars dataframes with parent-child relationships and lazy evaluation"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.14"
@@ -4,8 +4,8 @@ Provides backward compatibility for original imports while exposing
4
4
  serialization, analysis, and graph helpers in dedicated submodules.
5
5
  """
6
6
 
7
- from .node import Node # package exposing Node
7
+ from .node import DerivedColumnMeta, Node # package exposing Node
8
8
  from .workspace import Workspace # shim -> workspace.core.Workspace
9
9
 
10
- __version__ = "0.2.7"
11
- __all__ = ["Workspace", "Node"]
10
+ __version__ = "0.2.9"
11
+ __all__ = ["Workspace", "Node", "DerivedColumnMeta"]
@@ -4,7 +4,7 @@ Operations are provided as instance methods on ``Node`` while persistence is
4
4
  handled by the dedicated ``docworkspace.node.io`` module.
5
5
  """
6
6
 
7
- from .core import Node
7
+ from .core import DerivedColumnMeta, Node
8
8
  from .io import dumps, from_dict, loads, to_dict
9
9
 
10
- __all__ = ["Node", "to_dict", "from_dict", "dumps", "loads"]
10
+ __all__ = ["Node", "DerivedColumnMeta", "to_dict", "from_dict", "dumps", "loads"]
@@ -8,7 +8,17 @@ from __future__ import annotations
8
8
 
9
9
  import uuid
10
10
  from pathlib import Path
11
- from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Sequence, cast
11
+ from typing import (
12
+ TYPE_CHECKING,
13
+ Any,
14
+ Dict,
15
+ Literal,
16
+ Mapping,
17
+ Optional,
18
+ Sequence,
19
+ TypedDict,
20
+ cast,
21
+ )
12
22
 
13
23
  import polars as pl
14
24
 
@@ -16,6 +26,23 @@ if TYPE_CHECKING: # pragma: no cover
16
26
  from ..workspace.core import Workspace # pragma: no cover
17
27
 
18
28
 
29
+ class DerivedColumnMeta(TypedDict):
30
+ """Metadata for a hidden derived analytic column (Phase 2, decision 7).
31
+
32
+ Derived columns live alongside the user's columns in the same LazyFrame
33
+ but are stripped from frontend-facing schema projections. ``source_column``
34
+ points at the originating user column; ``form`` says what kind of derivation
35
+ (``"tokens"``, future: ``"pos"``, ``"ner"``); ``model`` identifies the
36
+ backend that produced it (``"jieba"``, ``"bert-base-uncased"``, ...).
37
+ """
38
+
39
+ source_column: str
40
+ form: str
41
+ model: str
42
+ language: Optional[str]
43
+ generated_at: str
44
+
45
+
19
46
  class Node:
20
47
  MAX_UNDO_DEPTH = 50
21
48
 
@@ -33,6 +60,7 @@ class Node:
33
60
  operation: str | None = None,
34
61
  id: str | None = None,
35
62
  document: str | None = None,
63
+ derived: Optional[Mapping[str, DerivedColumnMeta]] = None,
36
64
  ) -> None:
37
65
  self.id = id or str(uuid.uuid4())
38
66
  self.name = name or f"node_{self.id[:8]}"
@@ -46,6 +74,14 @@ class Node:
46
74
  self._redo_stack: list[pl.LazyFrame] = []
47
75
  self._data: pl.LazyFrame = data
48
76
  self._document_column: Optional[str] = document
77
+ # Per-column metadata for hidden derived analytic columns (Phase 2,
78
+ # decision 7). Keys are the derived column names that exist in the
79
+ # LazyFrame schema (e.g. "__derived__.tokens.text.jieba"); values
80
+ # carry source_column / form / model / language / generated_at.
81
+ # Empty dict on legacy nodes is fully backward compatible.
82
+ self.derived: dict[str, DerivedColumnMeta] = (
83
+ {k: dict(v) for k, v in derived.items()} if derived else {}
84
+ ) # type: ignore[assignment]
49
85
  self.parents: list[Node | str] = list(parents)
50
86
  self.workspace: Optional[Workspace] = workspace
51
87
  self.operation = operation
@@ -75,6 +111,7 @@ class Node:
75
111
  workspace=self.workspace,
76
112
  parents=[self],
77
113
  operation=item,
114
+ derived=self.derived,
78
115
  )
79
116
  if self.document:
80
117
  child.document = self.document
@@ -138,16 +175,24 @@ class Node:
138
175
  workspace=self.workspace,
139
176
  parents=[self],
140
177
  operation="filter",
178
+ derived=self.derived,
141
179
  )
142
180
 
143
181
  def select(self, *exprs: Any, **named_exprs: Any) -> "Node":
144
182
  result = self.data.select(*exprs, **named_exprs)
183
+ # User-driven select may drop derived columns from the schema; keep
184
+ # only the derived metadata entries whose column still exists.
185
+ result_columns = set(result.collect_schema().names())
186
+ retained_derived = {
187
+ name: meta for name, meta in self.derived.items() if name in result_columns
188
+ }
145
189
  return Node(
146
190
  data=result,
147
191
  name=f"select_{self.name}",
148
192
  workspace=self.workspace,
149
193
  parents=[self],
150
194
  operation="select",
195
+ derived=retained_derived,
151
196
  )
152
197
 
153
198
  def join(
@@ -167,12 +212,21 @@ class Node:
167
212
  **kwargs: Any,
168
213
  ) -> "Node":
169
214
  result = self.data.join(other.data, on=on, how=how, **kwargs)
215
+ # Union derived metadata from both sides; result column set may drop
216
+ # entries if join columns collide, so filter to the resulting schema.
217
+ result_columns = set(result.collect_schema().names())
218
+ merged: dict[str, DerivedColumnMeta] = {}
219
+ for source in (self.derived, other.derived):
220
+ for name, meta in source.items():
221
+ if name in result_columns:
222
+ merged[name] = meta
170
223
  return Node(
171
224
  data=result,
172
225
  name=f"join_{self.name}_{other.name}",
173
226
  workspace=self.workspace,
174
227
  parents=[self, other],
175
228
  operation=f"join({how})",
229
+ derived=merged,
176
230
  )
177
231
 
178
232
  def slice(self, offset: int, length: int | None = None) -> "Node":
@@ -183,6 +237,7 @@ class Node:
183
237
  workspace=self.workspace,
184
238
  parents=[self],
185
239
  operation="slice",
240
+ derived=self.derived,
186
241
  )
187
242
 
188
243
  def drop(
@@ -194,19 +249,36 @@ class Node:
194
249
  """Drop columns using Polars semantics and return a child node.
195
250
 
196
251
  Mirrors ``polars.LazyFrame.drop`` while preserving DocWorkspace lineage.
252
+ Cascade rule (decision 7): when a user column is dropped, any derived
253
+ columns whose ``source_column`` matched are also dropped and removed
254
+ from ``Node.derived``.
197
255
  """
256
+ before_names = set(self.data.collect_schema().names())
198
257
  result = self.data.drop(columns, *more_columns, strict=strict)
258
+ after_names = set(result.collect_schema().names())
259
+ dropped_sources = before_names - after_names
260
+
261
+ cascade_targets: list[str] = []
262
+ retained_derived: dict[str, DerivedColumnMeta] = {}
263
+ for derived_name, meta in self.derived.items():
264
+ if meta["source_column"] in dropped_sources and derived_name in after_names:
265
+ cascade_targets.append(derived_name)
266
+ elif derived_name in after_names:
267
+ retained_derived[derived_name] = meta
268
+
269
+ if cascade_targets:
270
+ result = result.drop(*cascade_targets, strict=False)
271
+
199
272
  child = Node(
200
273
  data=result,
201
274
  name=f"drop_{self.name}",
202
275
  workspace=self.workspace,
203
276
  parents=[self],
204
277
  operation="drop",
278
+ derived=retained_derived,
205
279
  )
206
280
 
207
281
  if self.document:
208
- before_names = set(self.data.collect_schema().names())
209
- after_names = set(result.collect_schema().names())
210
282
  if self.document in before_names and self.document not in after_names:
211
283
  child.document = None
212
284
  else:
@@ -215,8 +287,30 @@ class Node:
215
287
  return child
216
288
 
217
289
  def rename(self, mapping: Any, *, strict: bool = True) -> "Node":
218
- """Rename columns in-place using Polars semantics and return this node."""
219
- self.data = self.data.rename(mapping, strict=strict)
290
+ """Rename columns in-place using Polars semantics and return this node.
291
+
292
+ Cascade rule (decision 7): renaming a source column makes any derived
293
+ columns referencing it stale — they are dropped from the LazyFrame and
294
+ from ``Node.derived``. Users can re-tokenise after the rename.
295
+ """
296
+ before_names = set(self.data.collect_schema().names())
297
+ new_data = self.data.rename(mapping, strict=strict)
298
+ after_names = set(new_data.collect_schema().names())
299
+ renamed_sources = before_names - after_names
300
+
301
+ if self.derived and renamed_sources:
302
+ cascade_targets = [
303
+ derived_name
304
+ for derived_name, meta in self.derived.items()
305
+ if meta["source_column"] in renamed_sources
306
+ and derived_name in after_names
307
+ ]
308
+ if cascade_targets:
309
+ new_data = new_data.drop(*cascade_targets, strict=False)
310
+ for name in cascade_targets:
311
+ self.derived.pop(name, None)
312
+
313
+ self.data = new_data
220
314
 
221
315
  if self.document:
222
316
  new_document = self.document
@@ -273,6 +367,49 @@ class Node:
273
367
  def can_redo(self) -> bool:
274
368
  return len(self._redo_stack) > 0
275
369
 
370
+ # ------------------------------------------------------------------
371
+ # Derived-column metadata (Phase 2, decision 7)
372
+ # ------------------------------------------------------------------
373
+ def register_derived_column(
374
+ self, column_name: str, meta: DerivedColumnMeta
375
+ ) -> None:
376
+ """Record metadata for a hidden derived column on this node.
377
+
378
+ Caller is responsible for ensuring ``column_name`` exists in the
379
+ node's LazyFrame schema (typically after a ``with_columns(...)`` that
380
+ adds it). This method only writes the metadata index.
381
+ """
382
+ self.derived[column_name] = dict(meta) # type: ignore[assignment]
383
+
384
+ def unregister_derived_column(self, column_name: str) -> bool:
385
+ """Remove the metadata entry for ``column_name``. Does not touch the
386
+ LazyFrame schema. Returns True if an entry was removed.
387
+ """
388
+ return self.derived.pop(column_name, None) is not None
389
+
390
+ def find_derived_column(
391
+ self,
392
+ source_column: str,
393
+ *,
394
+ form: str = "tokens",
395
+ model: str | None = None,
396
+ ) -> str | None:
397
+ """Return the name of a derived column for ``source_column``, or None.
398
+
399
+ Filters by ``form`` (default ``"tokens"``); if ``model`` is given,
400
+ further narrows to that backend. When multiple candidates match,
401
+ returns the first by insertion order.
402
+ """
403
+ for name, meta in self.derived.items():
404
+ if meta.get("source_column") != source_column:
405
+ continue
406
+ if meta.get("form") != form:
407
+ continue
408
+ if model is not None and meta.get("model") != model:
409
+ continue
410
+ return name
411
+ return None
412
+
276
413
  # ------------------------------------------------------------------
277
414
  # Schema utilities
278
415
  # ------------------------------------------------------------------
@@ -332,4 +469,4 @@ class Node:
332
469
  )
333
470
 
334
471
 
335
- __all__ = ["Node"]
472
+ __all__ = ["Node", "DerivedColumnMeta"]
@@ -47,6 +47,7 @@ def to_dict(node: Node, *, base_dir: str | Path | None = None) -> dict[str, Any]
47
47
  "name": node.name,
48
48
  "operation": node.operation,
49
49
  "document": node.document,
50
+ "derived": {name: dict(meta) for name, meta in node.derived.items()},
50
51
  "parents": [node._parent_id(parent) for parent in node.parents],
51
52
  },
52
53
  "data_path": rel_data_path.as_posix(),
@@ -64,6 +65,9 @@ def from_dict(
64
65
  node_metadata = dict(payload["node_metadata"])
65
66
  data_path = Path(str(payload["data_path"]))
66
67
  parent_ids = node_metadata.pop("parents", [])
68
+ # Legacy workspaces persisted before Phase 2 won't have ``derived`` at
69
+ # all; default to empty so loading stays backward compatible.
70
+ node_metadata.setdefault("derived", {})
67
71
 
68
72
  if workspace is not None:
69
73
  root_dir = Path(workspace.ws_root_dir)
@@ -34,7 +34,20 @@ def graph_json(workspace: "Workspace") -> Dict[str, object]:
34
34
  edges_payload: List[Dict[str, str]] = []
35
35
 
36
36
  for node in workspace.nodes.values():
37
- nodes_payload.append(node.info())
37
+ try:
38
+ nodes_payload.append(node.info())
39
+ except Exception as exc:
40
+ # Per-node fallback: one broken node (e.g. missing source file,
41
+ # undeserializable lazy plan) must not take down the whole graph.
42
+ nodes_payload.append(
43
+ {
44
+ "id": node.id,
45
+ "name": getattr(node, "name", node.id),
46
+ "operation": getattr(node, "operation", "unknown"),
47
+ "child_ids": [c.id for c in getattr(node, "children", [])],
48
+ "error": f"{type(exc).__name__}: {exc}",
49
+ }
50
+ )
38
51
 
39
52
  for child in node.children:
40
53
  edges_payload.append({"source": node.id, "target": child.id})
@@ -19,8 +19,8 @@ class TestCoreLibraryIndependence:
19
19
  """Test that core library only exports core functionality."""
20
20
  from docworkspace import __all__
21
21
 
22
- # Core library should only export Node and Workspace
23
- expected_exports = {"Node", "Workspace"}
22
+ # Core library should only export Node, Workspace, and their typed helpers
23
+ expected_exports = {"Node", "Workspace", "DerivedColumnMeta"}
24
24
  actual_exports = set(__all__)
25
25
 
26
26
  assert actual_exports == expected_exports, (
@@ -1,12 +1,12 @@
1
1
  """Tests for the Node class."""
2
2
 
3
3
  from inspect import signature
4
- from typing import Optional, Sequence, cast, get_type_hints
4
+ from typing import Mapping, Optional, Sequence, cast, get_type_hints
5
5
 
6
6
  import polars as pl
7
7
  import pytest
8
8
 
9
- from docworkspace import Node, Workspace
9
+ from docworkspace import DerivedColumnMeta, Node, Workspace
10
10
 
11
11
 
12
12
  class TestNode:
@@ -63,6 +63,8 @@ class TestNode:
63
63
  "Workspace": Workspace,
64
64
  "Sequence": Sequence,
65
65
  "Optional": Optional,
66
+ "Mapping": Mapping,
67
+ "DerivedColumnMeta": DerivedColumnMeta,
66
68
  "pl": pl,
67
69
  },
68
70
  )
@@ -400,3 +402,36 @@ class TestNodeRelationships:
400
402
  assert merged in parent1.children
401
403
  assert merged in parent2.children
402
404
  assert merged in parent2.children
405
+
406
+
407
+ def test_node_shape_does_not_materialise_list_columns():
408
+ """Phase 2.8: Node.shape must compute height without scanning list columns.
409
+
410
+ A future change that pushes Node.shape towards full collect() (e.g. via
411
+ .height instead of .select(pl.len())) would break tokenised nodes by
412
+ forcing the List[Struct] column to be materialised. Bound it loosely at
413
+ 100ms on a 50k-row tokenised-style frame.
414
+ """
415
+ import time
416
+
417
+ N = 50_000
418
+ tokens_per_doc = 30
419
+ tokens_struct = [
420
+ {"token": f"t{i}", "start": i * 5, "end": i * 5 + 4}
421
+ for i in range(tokens_per_doc)
422
+ ]
423
+ df = pl.DataFrame(
424
+ {
425
+ "text": [f"doc {i} " * 5 for i in range(N)],
426
+ "TOKENS_tokens": [tokens_struct] * N,
427
+ }
428
+ )
429
+ node = Node(data=df.lazy(), name="bench")
430
+
431
+ start = time.perf_counter()
432
+ shape = node.shape
433
+ elapsed = time.perf_counter() - start
434
+
435
+ assert shape == (N, 2)
436
+ # Generous bound — typical observed time on dev hardware is < 1ms.
437
+ assert elapsed < 0.1, f"Node.shape took {elapsed*1000:.1f}ms; suspect materialisation regression"
@@ -30,6 +30,7 @@ def test_node_to_dict_persists_lazyframe_payload(tmp_path: Path):
30
30
  "name": "root",
31
31
  "operation": "source",
32
32
  "document": "text",
33
+ "derived": {},
33
34
  "parents": [],
34
35
  },
35
36
  "data_path": f"data/{node.id}.plbin",
@@ -233,3 +234,176 @@ def test_node_from_dict_ignores_missing_parent_ids(tmp_path: Path):
233
234
 
234
235
  assert restored.parents == []
235
236
  assert restored.parents == []
237
+
238
+
239
+ def test_node_derived_metadata_round_trip(tmp_path: Path):
240
+ """Phase 2.4 v2: Node.derived survives to_dict / from_dict."""
241
+ workspace = Workspace("node_io_derived")
242
+ workspace.ws_root_dir = tmp_path
243
+ derived_name = "__derived__.tokens.text.jieba"
244
+ meta = {
245
+ "source_column": "text",
246
+ "form": "tokens",
247
+ "model": "jieba",
248
+ "language": "zh",
249
+ "generated_at": "2026-05-12T00:00:00+00:00",
250
+ }
251
+ node = workspace.add_node(
252
+ Node(
253
+ data=pl.DataFrame({"text": ["今天天气很好"]}).lazy(),
254
+ name="zh_root",
255
+ workspace=workspace,
256
+ operation="source",
257
+ derived={derived_name: meta}, # type: ignore[arg-type]
258
+ )
259
+ )
260
+ node.document = "text"
261
+
262
+ payload = to_dict(node, base_dir=tmp_path)
263
+ assert payload["node_metadata"]["derived"] == {derived_name: meta}
264
+
265
+ # Round-trip into a fresh workspace
266
+ workspace2 = Workspace("node_io_derived_loaded")
267
+ workspace2.ws_root_dir = tmp_path
268
+ restored = from_dict(payload, workspace=workspace2)
269
+ assert restored.derived == {derived_name: meta}
270
+ assert restored.find_derived_column("text") == derived_name
271
+ assert restored.find_derived_column("text", model="jieba") == derived_name
272
+ assert restored.find_derived_column("text", model="other-model") is None
273
+
274
+
275
+ def test_node_legacy_payload_without_derived_loads_with_empty_dict(
276
+ tmp_path: Path,
277
+ ):
278
+ """Backward compat: workspaces persisted before Phase 2 lacking ``derived``
279
+ must still load, defaulting it to an empty dict."""
280
+ workspace = Workspace("legacy_node_io")
281
+ workspace.ws_root_dir = tmp_path
282
+ node = workspace.add_node(
283
+ Node(
284
+ data=pl.DataFrame({"text": ["legacy"]}).lazy(),
285
+ name="legacy_root",
286
+ workspace=workspace,
287
+ operation="source",
288
+ )
289
+ )
290
+
291
+ # Build a "legacy" payload — strip the new field the way old files would.
292
+ payload = to_dict(node, base_dir=tmp_path)
293
+ legacy_metadata = dict(payload["node_metadata"])
294
+ legacy_metadata.pop("derived", None)
295
+ legacy_payload = {**payload, "node_metadata": legacy_metadata}
296
+
297
+ workspace2 = Workspace("legacy_loaded")
298
+ workspace2.ws_root_dir = tmp_path
299
+ restored = from_dict(legacy_payload, workspace=workspace2)
300
+ assert restored.derived == {}
301
+
302
+
303
+ def test_node_derived_propagates_through_getattr(tmp_path: Path):
304
+ """Phase 2.4 v2: Node.derived propagates to children spawned by delegated
305
+ LazyFrame methods (schema-preserving ops like .head / .sort)."""
306
+ workspace = Workspace("derive_propagate")
307
+ workspace.ws_root_dir = tmp_path
308
+ derived_name = "__derived__.tokens.text.jieba"
309
+ meta = {
310
+ "source_column": "text",
311
+ "form": "tokens",
312
+ "model": "jieba",
313
+ "language": "zh",
314
+ "generated_at": "2026-05-12T00:00:00+00:00",
315
+ }
316
+ parent = workspace.add_node(
317
+ Node(
318
+ data=pl.DataFrame({"text": ["a", "b", "c"]}).lazy(),
319
+ name="zh_parent",
320
+ workspace=workspace,
321
+ operation="source",
322
+ derived={derived_name: meta}, # type: ignore[arg-type]
323
+ )
324
+ )
325
+ parent.document = "text"
326
+ child = parent.head(2)
327
+ assert child.derived == {derived_name: meta}
328
+
329
+
330
+ def test_node_drop_cascades_derived_columns(tmp_path: Path):
331
+ """Decision 7: dropping a source column auto-drops any derived columns
332
+ that reference it (both schema and metadata)."""
333
+ workspace = Workspace("derived_drop_cascade")
334
+ workspace.ws_root_dir = tmp_path
335
+ parent_lf = pl.DataFrame(
336
+ {
337
+ "text": ["a", "b"],
338
+ "other": [1, 2],
339
+ "__derived__.tokens.text.jieba": [
340
+ [{"token": "a", "start": 0, "end": 1}],
341
+ [{"token": "b", "start": 0, "end": 1}],
342
+ ],
343
+ }
344
+ ).lazy()
345
+ meta = {
346
+ "source_column": "text",
347
+ "form": "tokens",
348
+ "model": "jieba",
349
+ "language": "zh",
350
+ "generated_at": "2026-05-12T00:00:00+00:00",
351
+ }
352
+ parent = workspace.add_node(
353
+ Node(
354
+ data=parent_lf,
355
+ name="parent",
356
+ workspace=workspace,
357
+ derived={"__derived__.tokens.text.jieba": meta}, # type: ignore[arg-type]
358
+ )
359
+ )
360
+
361
+ # Dropping an UNRELATED column does NOT cascade.
362
+ survivor = parent.drop("other")
363
+ assert "__derived__.tokens.text.jieba" in survivor.derived
364
+ assert "__derived__.tokens.text.jieba" in survivor.data.collect_schema().names()
365
+
366
+ # Dropping the SOURCE column cascades: the derived column disappears from
367
+ # both the LazyFrame schema and the metadata index.
368
+ cascaded = parent.drop("text")
369
+ after_names = cascaded.data.collect_schema().names()
370
+ assert "__derived__.tokens.text.jieba" not in after_names
371
+ assert "__derived__.tokens.text.jieba" not in cascaded.derived
372
+
373
+
374
+ def test_node_rename_cascades_derived_columns(tmp_path: Path):
375
+ """Decision 7: renaming a source column drops derived columns that
376
+ referenced it (they become stale; user can re-tokenise)."""
377
+ workspace = Workspace("derived_rename_cascade")
378
+ workspace.ws_root_dir = tmp_path
379
+ parent_lf = pl.DataFrame(
380
+ {
381
+ "text": ["a", "b"],
382
+ "__derived__.tokens.text.jieba": [
383
+ [{"token": "a", "start": 0, "end": 1}],
384
+ [{"token": "b", "start": 0, "end": 1}],
385
+ ],
386
+ }
387
+ ).lazy()
388
+ meta = {
389
+ "source_column": "text",
390
+ "form": "tokens",
391
+ "model": "jieba",
392
+ "language": "zh",
393
+ "generated_at": "2026-05-12T00:00:00+00:00",
394
+ }
395
+ node = workspace.add_node(
396
+ Node(
397
+ data=parent_lf,
398
+ name="rename_target",
399
+ workspace=workspace,
400
+ derived={"__derived__.tokens.text.jieba": meta}, # type: ignore[arg-type]
401
+ )
402
+ )
403
+
404
+ node.rename({"text": "body"})
405
+ after_names = node.data.collect_schema().names()
406
+ assert "body" in after_names
407
+ assert "text" not in after_names
408
+ assert "__derived__.tokens.text.jieba" not in after_names
409
+ assert node.derived == {}
@@ -431,6 +431,38 @@ class TestWorkspaceGraphOperations:
431
431
  for field in required_fields:
432
432
  assert field in node_data
433
433
 
434
+ def test_workspace_graph_survives_broken_node_info(self):
435
+ """One node failing `info()` must not break the whole graph payload."""
436
+ workspace = Workspace("graph_resilience")
437
+ good_node = Node(
438
+ data=pl.DataFrame({"x": [1, 2, 3]}).lazy(),
439
+ name="good",
440
+ workspace=workspace,
441
+ )
442
+ bad_node = Node(
443
+ data=pl.DataFrame({"y": [4, 5]}).lazy(),
444
+ name="bad",
445
+ workspace=workspace,
446
+ )
447
+
448
+ # Simulate a broken lazy plan / missing source file: info() raises.
449
+ def _boom() -> dict:
450
+ raise RuntimeError("source parquet missing")
451
+
452
+ bad_node.info = _boom # type: ignore[method-assign]
453
+
454
+ graph_data = workspace.graph_json()
455
+
456
+ nodes_by_id = {n["id"]: n for n in graph_data["nodes"]}
457
+ assert good_node.id in nodes_by_id
458
+ assert bad_node.id in nodes_by_id
459
+ # Healthy node still carries its real info.
460
+ assert "shape" in nodes_by_id[good_node.id]
461
+ # Broken node carries an error envelope plus identity fields.
462
+ assert nodes_by_id[bad_node.id]["name"] == "bad"
463
+ assert "error" in nodes_by_id[bad_node.id]
464
+ assert "RuntimeError" in nodes_by_id[bad_node.id]["error"]
465
+
434
466
  def test_workspace_with_initial_data_loading(self):
435
467
  """Test explicit initial data loading after creating an empty workspace."""
436
468
  # Test with DataFrame converted to LazyFrame before creating a Node.
@@ -156,3 +156,71 @@ def test_rebase_then_rename_then_save_keeps_parquet(tmp_path: Path):
156
156
  # Save (triggers GC) — the parquet must survive.
157
157
  ws2.save(folder_b)
158
158
  assert (folder_b / "data" / "my_data.parquet").exists()
159
+
160
+
161
+ def test_rebase_preserves_tokenized_node_after_move(tmp_path: Path):
162
+ """Phase 2.9 regression: a node with a List[Struct] tokens column must
163
+ survive workspace-folder move + rebase_workspace_sources. The rebasing
164
+ walks scan-source paths inside the plbin, not the dataframe schema, so
165
+ it should be schema-agnostic — this test locks that in."""
166
+
167
+ folder_a = tmp_path / "Tokens"
168
+ folder_a.mkdir()
169
+ data_dir = folder_a / "data"
170
+ data_dir.mkdir()
171
+
172
+ parquet_path = data_dir / "docs.parquet"
173
+ _make_parquet(parquet_path, pl.DataFrame({"text": ["doc one", "doc two"]}))
174
+
175
+ ws = Workspace(name="Tokens", ws_root_dir=folder_a)
176
+ base_node = Node(
177
+ data=pl.scan_parquet(parquet_path.resolve()),
178
+ name="docs",
179
+ )
180
+ ws.add_node(base_node)
181
+
182
+ # Synthesize a derived tokens column on top via with_columns (LazyFrame
183
+ # plan; represents what worker_tasks_tokenize will produce in Phase 2.3).
184
+ derived_name = "__derived__.tokens.text.jieba"
185
+ derived_meta = {
186
+ "source_column": "text",
187
+ "form": "tokens",
188
+ "model": "jieba",
189
+ "language": "zh",
190
+ "generated_at": "2026-05-12T00:00:00+00:00",
191
+ }
192
+ tokens_frame = base_node.data.with_columns(
193
+ pl.lit([{"token": "doc", "start": 0, "end": 3}, {"token": "one", "start": 4, "end": 7}])
194
+ .alias(derived_name)
195
+ )
196
+ tokens_node = Node(
197
+ data=tokens_frame,
198
+ name="docs_tokens",
199
+ parents=[base_node],
200
+ operation="tokenize",
201
+ derived={derived_name: derived_meta},
202
+ )
203
+ ws.add_node(tokens_node)
204
+ ws.save(folder_a)
205
+
206
+ # Move the workspace folder to a new location.
207
+ folder_b = tmp_path / "Tokens_Moved"
208
+ shutil.copytree(folder_a, folder_b)
209
+ shutil.rmtree(folder_a)
210
+
211
+ rebase_workspace_sources(folder_b)
212
+ ws2 = Workspace.load(folder_b)
213
+
214
+ # Both nodes should be back, and the tokens node's lineage + metadata
215
+ # preserved.
216
+ assert len(ws2.nodes) == 2
217
+ loaded_tokens_node = next(
218
+ n for n in ws2.nodes.values() if n.name == "docs_tokens"
219
+ )
220
+ assert loaded_tokens_node.derived == {derived_name: derived_meta}
221
+ assert loaded_tokens_node.operation == "tokenize"
222
+
223
+ # The List[Struct] column should still be loadable end-to-end.
224
+ collected = cast(pl.DataFrame, loaded_tokens_node.data.collect())
225
+ assert derived_name in collected.columns
226
+ assert collected.height == 2
@@ -13,7 +13,7 @@ wheels = [
13
13
 
14
14
  [[package]]
15
15
  name = "docworkspace"
16
- version = "0.2.7"
16
+ version = "0.2.9"
17
17
  source = { editable = "." }
18
18
  dependencies = [
19
19
  { name = "polars-text" },
File without changes
File without changes
File without changes
File without changes