docworkspace 0.2.7__tar.gz → 0.2.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docworkspace-0.2.7 → docworkspace-0.2.9}/PKG-INFO +1 -1
- {docworkspace-0.2.7 → docworkspace-0.2.9}/pyproject.toml +1 -1
- {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/__init__.py +3 -3
- {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/node/__init__.py +2 -2
- {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/node/core.py +143 -6
- {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/node/io.py +4 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/workspace/analysis.py +14 -1
- {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_fastapi_integration.py +2 -2
- {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_node.py +37 -2
- {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_node_io.py +174 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_workspace.py +32 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_workspace_io_absolute_paths.py +68 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/uv.lock +1 -1
- {docworkspace-0.2.7 → docworkspace-0.2.9}/.github/workflows/ci.yml +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/.github/workflows/release.yml +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/.gitignore +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/PUBLISH.md +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/README.md +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/pytest.ini +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/workspace/__init__.py +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/workspace/core.py +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/src/docworkspace/workspace/io.py +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/conftest.py +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_simple_operations.py +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_workspace_serialization_types.py +0 -0
- {docworkspace-0.2.7 → docworkspace-0.2.9}/tests/test_workspace_shim.py +0 -0
|
@@ -4,8 +4,8 @@ Provides backward compatibility for original imports while exposing
|
|
|
4
4
|
serialization, analysis, and graph helpers in dedicated submodules.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
from .node import Node # package exposing Node
|
|
7
|
+
from .node import DerivedColumnMeta, Node # package exposing Node
|
|
8
8
|
from .workspace import Workspace # shim -> workspace.core.Workspace
|
|
9
9
|
|
|
10
|
-
__version__ = "0.2.7"
|
|
11
|
-
__all__ = ["Workspace", "Node"]
|
|
10
|
+
__version__ = "0.2.9"
|
|
11
|
+
__all__ = ["Workspace", "Node", "DerivedColumnMeta"]
|
|
@@ -4,7 +4,7 @@ Operations are provided as instance methods on ``Node`` while persistence is
|
|
|
4
4
|
handled by the dedicated ``docworkspace.node.io`` module.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
from .core import Node
|
|
7
|
+
from .core import DerivedColumnMeta, Node
|
|
8
8
|
from .io import dumps, from_dict, loads, to_dict
|
|
9
9
|
|
|
10
|
-
__all__ = ["Node", "to_dict", "from_dict", "dumps", "loads"]
|
|
10
|
+
__all__ = ["Node", "DerivedColumnMeta", "to_dict", "from_dict", "dumps", "loads"]
|
|
@@ -8,7 +8,17 @@ from __future__ import annotations
|
|
|
8
8
|
|
|
9
9
|
import uuid
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import
|
|
11
|
+
from typing import (
|
|
12
|
+
TYPE_CHECKING,
|
|
13
|
+
Any,
|
|
14
|
+
Dict,
|
|
15
|
+
Literal,
|
|
16
|
+
Mapping,
|
|
17
|
+
Optional,
|
|
18
|
+
Sequence,
|
|
19
|
+
TypedDict,
|
|
20
|
+
cast,
|
|
21
|
+
)
|
|
12
22
|
|
|
13
23
|
import polars as pl
|
|
14
24
|
|
|
@@ -16,6 +26,23 @@ if TYPE_CHECKING: # pragma: no cover
|
|
|
16
26
|
from ..workspace.core import Workspace # pragma: no cover
|
|
17
27
|
|
|
18
28
|
|
|
29
|
+
class DerivedColumnMeta(TypedDict):
|
|
30
|
+
"""Metadata for a hidden derived analytic column (Phase 2, decision 7).
|
|
31
|
+
|
|
32
|
+
Derived columns live alongside the user's columns in the same LazyFrame
|
|
33
|
+
but are stripped from frontend-facing schema projections. ``source_column``
|
|
34
|
+
points at the originating user column; ``form`` says what kind of derivation
|
|
35
|
+
(``"tokens"``, future: ``"pos"``, ``"ner"``); ``model`` identifies the
|
|
36
|
+
backend that produced it (``"jieba"``, ``"bert-base-uncased"``, ...).
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
source_column: str
|
|
40
|
+
form: str
|
|
41
|
+
model: str
|
|
42
|
+
language: Optional[str]
|
|
43
|
+
generated_at: str
|
|
44
|
+
|
|
45
|
+
|
|
19
46
|
class Node:
|
|
20
47
|
MAX_UNDO_DEPTH = 50
|
|
21
48
|
|
|
@@ -33,6 +60,7 @@ class Node:
|
|
|
33
60
|
operation: str | None = None,
|
|
34
61
|
id: str | None = None,
|
|
35
62
|
document: str | None = None,
|
|
63
|
+
derived: Optional[Mapping[str, DerivedColumnMeta]] = None,
|
|
36
64
|
) -> None:
|
|
37
65
|
self.id = id or str(uuid.uuid4())
|
|
38
66
|
self.name = name or f"node_{self.id[:8]}"
|
|
@@ -46,6 +74,14 @@ class Node:
|
|
|
46
74
|
self._redo_stack: list[pl.LazyFrame] = []
|
|
47
75
|
self._data: pl.LazyFrame = data
|
|
48
76
|
self._document_column: Optional[str] = document
|
|
77
|
+
# Per-column metadata for hidden derived analytic columns (Phase 2,
|
|
78
|
+
# decision 7). Keys are the derived column names that exist in the
|
|
79
|
+
# LazyFrame schema (e.g. "__derived__.tokens.text.jieba"); values
|
|
80
|
+
# carry source_column / form / model / language / generated_at.
|
|
81
|
+
# Empty dict on legacy nodes is fully backward compatible.
|
|
82
|
+
self.derived: dict[str, DerivedColumnMeta] = (
|
|
83
|
+
{k: dict(v) for k, v in derived.items()} if derived else {}
|
|
84
|
+
) # type: ignore[assignment]
|
|
49
85
|
self.parents: list[Node | str] = list(parents)
|
|
50
86
|
self.workspace: Optional[Workspace] = workspace
|
|
51
87
|
self.operation = operation
|
|
@@ -75,6 +111,7 @@ class Node:
|
|
|
75
111
|
workspace=self.workspace,
|
|
76
112
|
parents=[self],
|
|
77
113
|
operation=item,
|
|
114
|
+
derived=self.derived,
|
|
78
115
|
)
|
|
79
116
|
if self.document:
|
|
80
117
|
child.document = self.document
|
|
@@ -138,16 +175,24 @@ class Node:
|
|
|
138
175
|
workspace=self.workspace,
|
|
139
176
|
parents=[self],
|
|
140
177
|
operation="filter",
|
|
178
|
+
derived=self.derived,
|
|
141
179
|
)
|
|
142
180
|
|
|
143
181
|
def select(self, *exprs: Any, **named_exprs: Any) -> "Node":
|
|
144
182
|
result = self.data.select(*exprs, **named_exprs)
|
|
183
|
+
# User-driven select may drop derived columns from the schema; keep
|
|
184
|
+
# only the derived metadata entries whose column still exists.
|
|
185
|
+
result_columns = set(result.collect_schema().names())
|
|
186
|
+
retained_derived = {
|
|
187
|
+
name: meta for name, meta in self.derived.items() if name in result_columns
|
|
188
|
+
}
|
|
145
189
|
return Node(
|
|
146
190
|
data=result,
|
|
147
191
|
name=f"select_{self.name}",
|
|
148
192
|
workspace=self.workspace,
|
|
149
193
|
parents=[self],
|
|
150
194
|
operation="select",
|
|
195
|
+
derived=retained_derived,
|
|
151
196
|
)
|
|
152
197
|
|
|
153
198
|
def join(
|
|
@@ -167,12 +212,21 @@ class Node:
|
|
|
167
212
|
**kwargs: Any,
|
|
168
213
|
) -> "Node":
|
|
169
214
|
result = self.data.join(other.data, on=on, how=how, **kwargs)
|
|
215
|
+
# Union derived metadata from both sides; result column set may drop
|
|
216
|
+
# entries if join columns collide, so filter to the resulting schema.
|
|
217
|
+
result_columns = set(result.collect_schema().names())
|
|
218
|
+
merged: dict[str, DerivedColumnMeta] = {}
|
|
219
|
+
for source in (self.derived, other.derived):
|
|
220
|
+
for name, meta in source.items():
|
|
221
|
+
if name in result_columns:
|
|
222
|
+
merged[name] = meta
|
|
170
223
|
return Node(
|
|
171
224
|
data=result,
|
|
172
225
|
name=f"join_{self.name}_{other.name}",
|
|
173
226
|
workspace=self.workspace,
|
|
174
227
|
parents=[self, other],
|
|
175
228
|
operation=f"join({how})",
|
|
229
|
+
derived=merged,
|
|
176
230
|
)
|
|
177
231
|
|
|
178
232
|
def slice(self, offset: int, length: int | None = None) -> "Node":
|
|
@@ -183,6 +237,7 @@ class Node:
|
|
|
183
237
|
workspace=self.workspace,
|
|
184
238
|
parents=[self],
|
|
185
239
|
operation="slice",
|
|
240
|
+
derived=self.derived,
|
|
186
241
|
)
|
|
187
242
|
|
|
188
243
|
def drop(
|
|
@@ -194,19 +249,36 @@ class Node:
|
|
|
194
249
|
"""Drop columns using Polars semantics and return a child node.
|
|
195
250
|
|
|
196
251
|
Mirrors ``polars.LazyFrame.drop`` while preserving DocWorkspace lineage.
|
|
252
|
+
Cascade rule (decision 7): when a user column is dropped, any derived
|
|
253
|
+
columns whose ``source_column`` matched are also dropped and removed
|
|
254
|
+
from ``Node.derived``.
|
|
197
255
|
"""
|
|
256
|
+
before_names = set(self.data.collect_schema().names())
|
|
198
257
|
result = self.data.drop(columns, *more_columns, strict=strict)
|
|
258
|
+
after_names = set(result.collect_schema().names())
|
|
259
|
+
dropped_sources = before_names - after_names
|
|
260
|
+
|
|
261
|
+
cascade_targets: list[str] = []
|
|
262
|
+
retained_derived: dict[str, DerivedColumnMeta] = {}
|
|
263
|
+
for derived_name, meta in self.derived.items():
|
|
264
|
+
if meta["source_column"] in dropped_sources and derived_name in after_names:
|
|
265
|
+
cascade_targets.append(derived_name)
|
|
266
|
+
elif derived_name in after_names:
|
|
267
|
+
retained_derived[derived_name] = meta
|
|
268
|
+
|
|
269
|
+
if cascade_targets:
|
|
270
|
+
result = result.drop(*cascade_targets, strict=False)
|
|
271
|
+
|
|
199
272
|
child = Node(
|
|
200
273
|
data=result,
|
|
201
274
|
name=f"drop_{self.name}",
|
|
202
275
|
workspace=self.workspace,
|
|
203
276
|
parents=[self],
|
|
204
277
|
operation="drop",
|
|
278
|
+
derived=retained_derived,
|
|
205
279
|
)
|
|
206
280
|
|
|
207
281
|
if self.document:
|
|
208
|
-
before_names = set(self.data.collect_schema().names())
|
|
209
|
-
after_names = set(result.collect_schema().names())
|
|
210
282
|
if self.document in before_names and self.document not in after_names:
|
|
211
283
|
child.document = None
|
|
212
284
|
else:
|
|
@@ -215,8 +287,30 @@ class Node:
|
|
|
215
287
|
return child
|
|
216
288
|
|
|
217
289
|
def rename(self, mapping: Any, *, strict: bool = True) -> "Node":
|
|
218
|
-
"""Rename columns in-place using Polars semantics and return this node.
|
|
219
|
-
|
|
290
|
+
"""Rename columns in-place using Polars semantics and return this node.
|
|
291
|
+
|
|
292
|
+
Cascade rule (decision 7): renaming a source column makes any derived
|
|
293
|
+
columns referencing it stale — they are dropped from the LazyFrame and
|
|
294
|
+
from ``Node.derived``. Users can re-tokenise after the rename.
|
|
295
|
+
"""
|
|
296
|
+
before_names = set(self.data.collect_schema().names())
|
|
297
|
+
new_data = self.data.rename(mapping, strict=strict)
|
|
298
|
+
after_names = set(new_data.collect_schema().names())
|
|
299
|
+
renamed_sources = before_names - after_names
|
|
300
|
+
|
|
301
|
+
if self.derived and renamed_sources:
|
|
302
|
+
cascade_targets = [
|
|
303
|
+
derived_name
|
|
304
|
+
for derived_name, meta in self.derived.items()
|
|
305
|
+
if meta["source_column"] in renamed_sources
|
|
306
|
+
and derived_name in after_names
|
|
307
|
+
]
|
|
308
|
+
if cascade_targets:
|
|
309
|
+
new_data = new_data.drop(*cascade_targets, strict=False)
|
|
310
|
+
for name in cascade_targets:
|
|
311
|
+
self.derived.pop(name, None)
|
|
312
|
+
|
|
313
|
+
self.data = new_data
|
|
220
314
|
|
|
221
315
|
if self.document:
|
|
222
316
|
new_document = self.document
|
|
@@ -273,6 +367,49 @@ class Node:
|
|
|
273
367
|
def can_redo(self) -> bool:
|
|
274
368
|
return len(self._redo_stack) > 0
|
|
275
369
|
|
|
370
|
+
# ------------------------------------------------------------------
|
|
371
|
+
# Derived-column metadata (Phase 2, decision 7)
|
|
372
|
+
# ------------------------------------------------------------------
|
|
373
|
+
def register_derived_column(
|
|
374
|
+
self, column_name: str, meta: DerivedColumnMeta
|
|
375
|
+
) -> None:
|
|
376
|
+
"""Record metadata for a hidden derived column on this node.
|
|
377
|
+
|
|
378
|
+
Caller is responsible for ensuring ``column_name`` exists in the
|
|
379
|
+
node's LazyFrame schema (typically after a ``with_columns(...)`` that
|
|
380
|
+
adds it). This method only writes the metadata index.
|
|
381
|
+
"""
|
|
382
|
+
self.derived[column_name] = dict(meta) # type: ignore[assignment]
|
|
383
|
+
|
|
384
|
+
def unregister_derived_column(self, column_name: str) -> bool:
|
|
385
|
+
"""Remove the metadata entry for ``column_name``. Does not touch the
|
|
386
|
+
LazyFrame schema. Returns True if an entry was removed.
|
|
387
|
+
"""
|
|
388
|
+
return self.derived.pop(column_name, None) is not None
|
|
389
|
+
|
|
390
|
+
def find_derived_column(
|
|
391
|
+
self,
|
|
392
|
+
source_column: str,
|
|
393
|
+
*,
|
|
394
|
+
form: str = "tokens",
|
|
395
|
+
model: str | None = None,
|
|
396
|
+
) -> str | None:
|
|
397
|
+
"""Return the name of a derived column for ``source_column``, or None.
|
|
398
|
+
|
|
399
|
+
Filters by ``form`` (default ``"tokens"``); if ``model`` is given,
|
|
400
|
+
further narrows to that backend. When multiple candidates match,
|
|
401
|
+
returns the first by insertion order.
|
|
402
|
+
"""
|
|
403
|
+
for name, meta in self.derived.items():
|
|
404
|
+
if meta.get("source_column") != source_column:
|
|
405
|
+
continue
|
|
406
|
+
if meta.get("form") != form:
|
|
407
|
+
continue
|
|
408
|
+
if model is not None and meta.get("model") != model:
|
|
409
|
+
continue
|
|
410
|
+
return name
|
|
411
|
+
return None
|
|
412
|
+
|
|
276
413
|
# ------------------------------------------------------------------
|
|
277
414
|
# Schema utilities
|
|
278
415
|
# ------------------------------------------------------------------
|
|
@@ -332,4 +469,4 @@ class Node:
|
|
|
332
469
|
)
|
|
333
470
|
|
|
334
471
|
|
|
335
|
-
__all__ = ["Node"]
|
|
472
|
+
__all__ = ["Node", "DerivedColumnMeta"]
|
|
@@ -47,6 +47,7 @@ def to_dict(node: Node, *, base_dir: str | Path | None = None) -> dict[str, Any]
|
|
|
47
47
|
"name": node.name,
|
|
48
48
|
"operation": node.operation,
|
|
49
49
|
"document": node.document,
|
|
50
|
+
"derived": {name: dict(meta) for name, meta in node.derived.items()},
|
|
50
51
|
"parents": [node._parent_id(parent) for parent in node.parents],
|
|
51
52
|
},
|
|
52
53
|
"data_path": rel_data_path.as_posix(),
|
|
@@ -64,6 +65,9 @@ def from_dict(
|
|
|
64
65
|
node_metadata = dict(payload["node_metadata"])
|
|
65
66
|
data_path = Path(str(payload["data_path"]))
|
|
66
67
|
parent_ids = node_metadata.pop("parents", [])
|
|
68
|
+
# Legacy workspaces persisted before Phase 2 won't have ``derived`` at
|
|
69
|
+
# all; default to empty so loading stays backward compatible.
|
|
70
|
+
node_metadata.setdefault("derived", {})
|
|
67
71
|
|
|
68
72
|
if workspace is not None:
|
|
69
73
|
root_dir = Path(workspace.ws_root_dir)
|
|
@@ -34,7 +34,20 @@ def graph_json(workspace: "Workspace") -> Dict[str, object]:
|
|
|
34
34
|
edges_payload: List[Dict[str, str]] = []
|
|
35
35
|
|
|
36
36
|
for node in workspace.nodes.values():
|
|
37
|
-
|
|
37
|
+
try:
|
|
38
|
+
nodes_payload.append(node.info())
|
|
39
|
+
except Exception as exc:
|
|
40
|
+
# Per-node fallback: one broken node (e.g. missing source file,
|
|
41
|
+
# undeserializable lazy plan) must not take down the whole graph.
|
|
42
|
+
nodes_payload.append(
|
|
43
|
+
{
|
|
44
|
+
"id": node.id,
|
|
45
|
+
"name": getattr(node, "name", node.id),
|
|
46
|
+
"operation": getattr(node, "operation", "unknown"),
|
|
47
|
+
"child_ids": [c.id for c in getattr(node, "children", [])],
|
|
48
|
+
"error": f"{type(exc).__name__}: {exc}",
|
|
49
|
+
}
|
|
50
|
+
)
|
|
38
51
|
|
|
39
52
|
for child in node.children:
|
|
40
53
|
edges_payload.append({"source": node.id, "target": child.id})
|
|
@@ -19,8 +19,8 @@ class TestCoreLibraryIndependence:
|
|
|
19
19
|
"""Test that core library only exports core functionality."""
|
|
20
20
|
from docworkspace import __all__
|
|
21
21
|
|
|
22
|
-
# Core library should only export Node and Workspace
|
|
23
|
-
expected_exports = {"Node", "Workspace"}
|
|
22
|
+
# Core library should only export Node, Workspace, and their typed helpers
|
|
23
|
+
expected_exports = {"Node", "Workspace", "DerivedColumnMeta"}
|
|
24
24
|
actual_exports = set(__all__)
|
|
25
25
|
|
|
26
26
|
assert actual_exports == expected_exports, (
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
"""Tests for the Node class."""
|
|
2
2
|
|
|
3
3
|
from inspect import signature
|
|
4
|
-
from typing import Optional, Sequence, cast, get_type_hints
|
|
4
|
+
from typing import Mapping, Optional, Sequence, cast, get_type_hints
|
|
5
5
|
|
|
6
6
|
import polars as pl
|
|
7
7
|
import pytest
|
|
8
8
|
|
|
9
|
-
from docworkspace import Node, Workspace
|
|
9
|
+
from docworkspace import DerivedColumnMeta, Node, Workspace
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class TestNode:
|
|
@@ -63,6 +63,8 @@ class TestNode:
|
|
|
63
63
|
"Workspace": Workspace,
|
|
64
64
|
"Sequence": Sequence,
|
|
65
65
|
"Optional": Optional,
|
|
66
|
+
"Mapping": Mapping,
|
|
67
|
+
"DerivedColumnMeta": DerivedColumnMeta,
|
|
66
68
|
"pl": pl,
|
|
67
69
|
},
|
|
68
70
|
)
|
|
@@ -400,3 +402,36 @@ class TestNodeRelationships:
|
|
|
400
402
|
assert merged in parent1.children
|
|
401
403
|
assert merged in parent2.children
|
|
402
404
|
assert merged in parent2.children
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def test_node_shape_does_not_materialise_list_columns():
|
|
408
|
+
"""Phase 2.8: Node.shape must compute height without scanning list columns.
|
|
409
|
+
|
|
410
|
+
A future change that pushes Node.shape towards full collect() (e.g. via
|
|
411
|
+
.height instead of .select(pl.len())) would break tokenised nodes by
|
|
412
|
+
forcing the List[Struct] column to be materialised. Bound it loosely at
|
|
413
|
+
100ms on a 50k-row tokenised-style frame.
|
|
414
|
+
"""
|
|
415
|
+
import time
|
|
416
|
+
|
|
417
|
+
N = 50_000
|
|
418
|
+
tokens_per_doc = 30
|
|
419
|
+
tokens_struct = [
|
|
420
|
+
{"token": f"t{i}", "start": i * 5, "end": i * 5 + 4}
|
|
421
|
+
for i in range(tokens_per_doc)
|
|
422
|
+
]
|
|
423
|
+
df = pl.DataFrame(
|
|
424
|
+
{
|
|
425
|
+
"text": [f"doc {i} " * 5 for i in range(N)],
|
|
426
|
+
"TOKENS_tokens": [tokens_struct] * N,
|
|
427
|
+
}
|
|
428
|
+
)
|
|
429
|
+
node = Node(data=df.lazy(), name="bench")
|
|
430
|
+
|
|
431
|
+
start = time.perf_counter()
|
|
432
|
+
shape = node.shape
|
|
433
|
+
elapsed = time.perf_counter() - start
|
|
434
|
+
|
|
435
|
+
assert shape == (N, 2)
|
|
436
|
+
# Generous bound — typical observed time on dev hardware is < 1ms.
|
|
437
|
+
assert elapsed < 0.1, f"Node.shape took {elapsed*1000:.1f}ms; suspect materialisation regression"
|
|
@@ -30,6 +30,7 @@ def test_node_to_dict_persists_lazyframe_payload(tmp_path: Path):
|
|
|
30
30
|
"name": "root",
|
|
31
31
|
"operation": "source",
|
|
32
32
|
"document": "text",
|
|
33
|
+
"derived": {},
|
|
33
34
|
"parents": [],
|
|
34
35
|
},
|
|
35
36
|
"data_path": f"data/{node.id}.plbin",
|
|
@@ -233,3 +234,176 @@ def test_node_from_dict_ignores_missing_parent_ids(tmp_path: Path):
|
|
|
233
234
|
|
|
234
235
|
assert restored.parents == []
|
|
235
236
|
assert restored.parents == []
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def test_node_derived_metadata_round_trip(tmp_path: Path):
|
|
240
|
+
"""Phase 2.4 v2: Node.derived survives to_dict / from_dict."""
|
|
241
|
+
workspace = Workspace("node_io_derived")
|
|
242
|
+
workspace.ws_root_dir = tmp_path
|
|
243
|
+
derived_name = "__derived__.tokens.text.jieba"
|
|
244
|
+
meta = {
|
|
245
|
+
"source_column": "text",
|
|
246
|
+
"form": "tokens",
|
|
247
|
+
"model": "jieba",
|
|
248
|
+
"language": "zh",
|
|
249
|
+
"generated_at": "2026-05-12T00:00:00+00:00",
|
|
250
|
+
}
|
|
251
|
+
node = workspace.add_node(
|
|
252
|
+
Node(
|
|
253
|
+
data=pl.DataFrame({"text": ["今天天气很好"]}).lazy(),
|
|
254
|
+
name="zh_root",
|
|
255
|
+
workspace=workspace,
|
|
256
|
+
operation="source",
|
|
257
|
+
derived={derived_name: meta}, # type: ignore[arg-type]
|
|
258
|
+
)
|
|
259
|
+
)
|
|
260
|
+
node.document = "text"
|
|
261
|
+
|
|
262
|
+
payload = to_dict(node, base_dir=tmp_path)
|
|
263
|
+
assert payload["node_metadata"]["derived"] == {derived_name: meta}
|
|
264
|
+
|
|
265
|
+
# Round-trip into a fresh workspace
|
|
266
|
+
workspace2 = Workspace("node_io_derived_loaded")
|
|
267
|
+
workspace2.ws_root_dir = tmp_path
|
|
268
|
+
restored = from_dict(payload, workspace=workspace2)
|
|
269
|
+
assert restored.derived == {derived_name: meta}
|
|
270
|
+
assert restored.find_derived_column("text") == derived_name
|
|
271
|
+
assert restored.find_derived_column("text", model="jieba") == derived_name
|
|
272
|
+
assert restored.find_derived_column("text", model="other-model") is None
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def test_node_legacy_payload_without_derived_loads_with_empty_dict(
|
|
276
|
+
tmp_path: Path,
|
|
277
|
+
):
|
|
278
|
+
"""Backward compat: workspaces persisted before Phase 2 lacking ``derived``
|
|
279
|
+
must still load, defaulting it to an empty dict."""
|
|
280
|
+
workspace = Workspace("legacy_node_io")
|
|
281
|
+
workspace.ws_root_dir = tmp_path
|
|
282
|
+
node = workspace.add_node(
|
|
283
|
+
Node(
|
|
284
|
+
data=pl.DataFrame({"text": ["legacy"]}).lazy(),
|
|
285
|
+
name="legacy_root",
|
|
286
|
+
workspace=workspace,
|
|
287
|
+
operation="source",
|
|
288
|
+
)
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
# Build a "legacy" payload — strip the new field the way old files would.
|
|
292
|
+
payload = to_dict(node, base_dir=tmp_path)
|
|
293
|
+
legacy_metadata = dict(payload["node_metadata"])
|
|
294
|
+
legacy_metadata.pop("derived", None)
|
|
295
|
+
legacy_payload = {**payload, "node_metadata": legacy_metadata}
|
|
296
|
+
|
|
297
|
+
workspace2 = Workspace("legacy_loaded")
|
|
298
|
+
workspace2.ws_root_dir = tmp_path
|
|
299
|
+
restored = from_dict(legacy_payload, workspace=workspace2)
|
|
300
|
+
assert restored.derived == {}
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def test_node_derived_propagates_through_getattr(tmp_path: Path):
|
|
304
|
+
"""Phase 2.4 v2: Node.derived propagates to children spawned by delegated
|
|
305
|
+
LazyFrame methods (schema-preserving ops like .head / .sort)."""
|
|
306
|
+
workspace = Workspace("derive_propagate")
|
|
307
|
+
workspace.ws_root_dir = tmp_path
|
|
308
|
+
derived_name = "__derived__.tokens.text.jieba"
|
|
309
|
+
meta = {
|
|
310
|
+
"source_column": "text",
|
|
311
|
+
"form": "tokens",
|
|
312
|
+
"model": "jieba",
|
|
313
|
+
"language": "zh",
|
|
314
|
+
"generated_at": "2026-05-12T00:00:00+00:00",
|
|
315
|
+
}
|
|
316
|
+
parent = workspace.add_node(
|
|
317
|
+
Node(
|
|
318
|
+
data=pl.DataFrame({"text": ["a", "b", "c"]}).lazy(),
|
|
319
|
+
name="zh_parent",
|
|
320
|
+
workspace=workspace,
|
|
321
|
+
operation="source",
|
|
322
|
+
derived={derived_name: meta}, # type: ignore[arg-type]
|
|
323
|
+
)
|
|
324
|
+
)
|
|
325
|
+
parent.document = "text"
|
|
326
|
+
child = parent.head(2)
|
|
327
|
+
assert child.derived == {derived_name: meta}
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def test_node_drop_cascades_derived_columns(tmp_path: Path):
|
|
331
|
+
"""Decision 7: dropping a source column auto-drops any derived columns
|
|
332
|
+
that reference it (both schema and metadata)."""
|
|
333
|
+
workspace = Workspace("derived_drop_cascade")
|
|
334
|
+
workspace.ws_root_dir = tmp_path
|
|
335
|
+
parent_lf = pl.DataFrame(
|
|
336
|
+
{
|
|
337
|
+
"text": ["a", "b"],
|
|
338
|
+
"other": [1, 2],
|
|
339
|
+
"__derived__.tokens.text.jieba": [
|
|
340
|
+
[{"token": "a", "start": 0, "end": 1}],
|
|
341
|
+
[{"token": "b", "start": 0, "end": 1}],
|
|
342
|
+
],
|
|
343
|
+
}
|
|
344
|
+
).lazy()
|
|
345
|
+
meta = {
|
|
346
|
+
"source_column": "text",
|
|
347
|
+
"form": "tokens",
|
|
348
|
+
"model": "jieba",
|
|
349
|
+
"language": "zh",
|
|
350
|
+
"generated_at": "2026-05-12T00:00:00+00:00",
|
|
351
|
+
}
|
|
352
|
+
parent = workspace.add_node(
|
|
353
|
+
Node(
|
|
354
|
+
data=parent_lf,
|
|
355
|
+
name="parent",
|
|
356
|
+
workspace=workspace,
|
|
357
|
+
derived={"__derived__.tokens.text.jieba": meta}, # type: ignore[arg-type]
|
|
358
|
+
)
|
|
359
|
+
)
|
|
360
|
+
|
|
361
|
+
# Dropping an UNRELATED column does NOT cascade.
|
|
362
|
+
survivor = parent.drop("other")
|
|
363
|
+
assert "__derived__.tokens.text.jieba" in survivor.derived
|
|
364
|
+
assert "__derived__.tokens.text.jieba" in survivor.data.collect_schema().names()
|
|
365
|
+
|
|
366
|
+
# Dropping the SOURCE column cascades: the derived column disappears from
|
|
367
|
+
# both the LazyFrame schema and the metadata index.
|
|
368
|
+
cascaded = parent.drop("text")
|
|
369
|
+
after_names = cascaded.data.collect_schema().names()
|
|
370
|
+
assert "__derived__.tokens.text.jieba" not in after_names
|
|
371
|
+
assert "__derived__.tokens.text.jieba" not in cascaded.derived
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def test_node_rename_cascades_derived_columns(tmp_path: Path):
|
|
375
|
+
"""Decision 7: renaming a source column drops derived columns that
|
|
376
|
+
referenced it (they become stale; user can re-tokenise)."""
|
|
377
|
+
workspace = Workspace("derived_rename_cascade")
|
|
378
|
+
workspace.ws_root_dir = tmp_path
|
|
379
|
+
parent_lf = pl.DataFrame(
|
|
380
|
+
{
|
|
381
|
+
"text": ["a", "b"],
|
|
382
|
+
"__derived__.tokens.text.jieba": [
|
|
383
|
+
[{"token": "a", "start": 0, "end": 1}],
|
|
384
|
+
[{"token": "b", "start": 0, "end": 1}],
|
|
385
|
+
],
|
|
386
|
+
}
|
|
387
|
+
).lazy()
|
|
388
|
+
meta = {
|
|
389
|
+
"source_column": "text",
|
|
390
|
+
"form": "tokens",
|
|
391
|
+
"model": "jieba",
|
|
392
|
+
"language": "zh",
|
|
393
|
+
"generated_at": "2026-05-12T00:00:00+00:00",
|
|
394
|
+
}
|
|
395
|
+
node = workspace.add_node(
|
|
396
|
+
Node(
|
|
397
|
+
data=parent_lf,
|
|
398
|
+
name="rename_target",
|
|
399
|
+
workspace=workspace,
|
|
400
|
+
derived={"__derived__.tokens.text.jieba": meta}, # type: ignore[arg-type]
|
|
401
|
+
)
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
node.rename({"text": "body"})
|
|
405
|
+
after_names = node.data.collect_schema().names()
|
|
406
|
+
assert "body" in after_names
|
|
407
|
+
assert "text" not in after_names
|
|
408
|
+
assert "__derived__.tokens.text.jieba" not in after_names
|
|
409
|
+
assert node.derived == {}
|
|
@@ -431,6 +431,38 @@ class TestWorkspaceGraphOperations:
|
|
|
431
431
|
for field in required_fields:
|
|
432
432
|
assert field in node_data
|
|
433
433
|
|
|
434
|
+
def test_workspace_graph_survives_broken_node_info(self):
|
|
435
|
+
"""One node failing `info()` must not break the whole graph payload."""
|
|
436
|
+
workspace = Workspace("graph_resilience")
|
|
437
|
+
good_node = Node(
|
|
438
|
+
data=pl.DataFrame({"x": [1, 2, 3]}).lazy(),
|
|
439
|
+
name="good",
|
|
440
|
+
workspace=workspace,
|
|
441
|
+
)
|
|
442
|
+
bad_node = Node(
|
|
443
|
+
data=pl.DataFrame({"y": [4, 5]}).lazy(),
|
|
444
|
+
name="bad",
|
|
445
|
+
workspace=workspace,
|
|
446
|
+
)
|
|
447
|
+
|
|
448
|
+
# Simulate a broken lazy plan / missing source file: info() raises.
|
|
449
|
+
def _boom() -> dict:
|
|
450
|
+
raise RuntimeError("source parquet missing")
|
|
451
|
+
|
|
452
|
+
bad_node.info = _boom # type: ignore[method-assign]
|
|
453
|
+
|
|
454
|
+
graph_data = workspace.graph_json()
|
|
455
|
+
|
|
456
|
+
nodes_by_id = {n["id"]: n for n in graph_data["nodes"]}
|
|
457
|
+
assert good_node.id in nodes_by_id
|
|
458
|
+
assert bad_node.id in nodes_by_id
|
|
459
|
+
# Healthy node still carries its real info.
|
|
460
|
+
assert "shape" in nodes_by_id[good_node.id]
|
|
461
|
+
# Broken node carries an error envelope plus identity fields.
|
|
462
|
+
assert nodes_by_id[bad_node.id]["name"] == "bad"
|
|
463
|
+
assert "error" in nodes_by_id[bad_node.id]
|
|
464
|
+
assert "RuntimeError" in nodes_by_id[bad_node.id]["error"]
|
|
465
|
+
|
|
434
466
|
def test_workspace_with_initial_data_loading(self):
|
|
435
467
|
"""Test explicit initial data loading after creating an empty workspace."""
|
|
436
468
|
# Test with DataFrame converted to LazyFrame before creating a Node.
|
|
@@ -156,3 +156,71 @@ def test_rebase_then_rename_then_save_keeps_parquet(tmp_path: Path):
|
|
|
156
156
|
# Save (triggers GC) — the parquet must survive.
|
|
157
157
|
ws2.save(folder_b)
|
|
158
158
|
assert (folder_b / "data" / "my_data.parquet").exists()
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def test_rebase_preserves_tokenized_node_after_move(tmp_path: Path):
|
|
162
|
+
"""Phase 2.9 regression: a node with a List[Struct] tokens column must
|
|
163
|
+
survive workspace-folder move + rebase_workspace_sources. The rebasing
|
|
164
|
+
walks scan-source paths inside the plbin, not the dataframe schema, so
|
|
165
|
+
it should be schema-agnostic — this test locks that in."""
|
|
166
|
+
|
|
167
|
+
folder_a = tmp_path / "Tokens"
|
|
168
|
+
folder_a.mkdir()
|
|
169
|
+
data_dir = folder_a / "data"
|
|
170
|
+
data_dir.mkdir()
|
|
171
|
+
|
|
172
|
+
parquet_path = data_dir / "docs.parquet"
|
|
173
|
+
_make_parquet(parquet_path, pl.DataFrame({"text": ["doc one", "doc two"]}))
|
|
174
|
+
|
|
175
|
+
ws = Workspace(name="Tokens", ws_root_dir=folder_a)
|
|
176
|
+
base_node = Node(
|
|
177
|
+
data=pl.scan_parquet(parquet_path.resolve()),
|
|
178
|
+
name="docs",
|
|
179
|
+
)
|
|
180
|
+
ws.add_node(base_node)
|
|
181
|
+
|
|
182
|
+
# Synthesize a derived tokens column on top via with_columns (LazyFrame
|
|
183
|
+
# plan; represents what worker_tasks_tokenize will produce in Phase 2.3).
|
|
184
|
+
derived_name = "__derived__.tokens.text.jieba"
|
|
185
|
+
derived_meta = {
|
|
186
|
+
"source_column": "text",
|
|
187
|
+
"form": "tokens",
|
|
188
|
+
"model": "jieba",
|
|
189
|
+
"language": "zh",
|
|
190
|
+
"generated_at": "2026-05-12T00:00:00+00:00",
|
|
191
|
+
}
|
|
192
|
+
tokens_frame = base_node.data.with_columns(
|
|
193
|
+
pl.lit([{"token": "doc", "start": 0, "end": 3}, {"token": "one", "start": 4, "end": 7}])
|
|
194
|
+
.alias(derived_name)
|
|
195
|
+
)
|
|
196
|
+
tokens_node = Node(
|
|
197
|
+
data=tokens_frame,
|
|
198
|
+
name="docs_tokens",
|
|
199
|
+
parents=[base_node],
|
|
200
|
+
operation="tokenize",
|
|
201
|
+
derived={derived_name: derived_meta},
|
|
202
|
+
)
|
|
203
|
+
ws.add_node(tokens_node)
|
|
204
|
+
ws.save(folder_a)
|
|
205
|
+
|
|
206
|
+
# Move the workspace folder to a new location.
|
|
207
|
+
folder_b = tmp_path / "Tokens_Moved"
|
|
208
|
+
shutil.copytree(folder_a, folder_b)
|
|
209
|
+
shutil.rmtree(folder_a)
|
|
210
|
+
|
|
211
|
+
rebase_workspace_sources(folder_b)
|
|
212
|
+
ws2 = Workspace.load(folder_b)
|
|
213
|
+
|
|
214
|
+
# Both nodes should be back, and the tokens node's lineage + metadata
|
|
215
|
+
# preserved.
|
|
216
|
+
assert len(ws2.nodes) == 2
|
|
217
|
+
loaded_tokens_node = next(
|
|
218
|
+
n for n in ws2.nodes.values() if n.name == "docs_tokens"
|
|
219
|
+
)
|
|
220
|
+
assert loaded_tokens_node.derived == {derived_name: derived_meta}
|
|
221
|
+
assert loaded_tokens_node.operation == "tokenize"
|
|
222
|
+
|
|
223
|
+
# The List[Struct] column should still be loadable end-to-end.
|
|
224
|
+
collected = cast(pl.DataFrame, loaded_tokens_node.data.collect())
|
|
225
|
+
assert derived_name in collected.columns
|
|
226
|
+
assert collected.height == 2
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|