docworkspace 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ """docworkspace public API exports after module split.
2
+
3
+ Provides backward compatibility for original imports while exposing
4
+ serialization, analysis, and graph helpers in dedicated submodules.
5
+ """
6
+
7
+ from .node import Node # package exposing Node
8
+ from .workspace import Workspace # shim -> workspace.core.Workspace
9
+
10
+ __version__ = "0.1.0"
11
+ __all__ = ["Workspace", "Node"]
@@ -0,0 +1,10 @@
1
+ """Public exports for node package.
2
+
3
+ Operations are now provided purely as instance methods on ``Node``:
4
+ node.filter(...), node.select(...), node.join(...)
5
+ The former functional helpers have been removed to reduce duplication.
6
+ """
7
+
8
+ from .core import Node
9
+
10
+ __all__ = ["Node"]
@@ -0,0 +1,424 @@
1
+ """Node core definition (split from former monolithic node.py).
2
+
3
+ Contains structural aspects: construction, parent/child tracking, schema helpers,
4
+ materialization, serialization (kept minimal for workspace persistence),
5
+ and core dataframe operations (join/filter/slice/dynamic delegation).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import uuid
11
+ from enum import Enum
12
+ from typing import Any, Dict, List, Literal, Optional
13
+
14
+ import polars as pl
15
+ from polars import DataFrame, LazyFrame
16
+
17
+ from docframe import DocDataFrame, DocLazyFrame # type: ignore # runtime import
18
+
19
+ if False: # TYPE_CHECKING replacement to avoid runtime import cycle
20
+ from ..workspace.core import Workspace # pragma: no cover
21
+
22
# Union of every data container a Node may wrap. It is also passed to
# ``isinstance`` at runtime, which needs PEP 604 union objects (Python 3.10+).
SupportedDataTypes = (
    DataFrame
    | LazyFrame
    | DocDataFrame
    | DocLazyFrame
)
24
+
25
+
26
class NodeDataType(str, Enum):
    """String-valued names of the data containers a node can hold.

    Each member's value equals its name, and because the enum mixes in
    ``str`` every member compares equal to the plain string it carries.
    """

    DataFrame = "DataFrame"
    LazyFrame = "LazyFrame"
    DocDataFrame = "DocDataFrame"
    DocLazyFrame = "DocLazyFrame"
31
+
32
+
33
# Type tags written into (and read back from) a serialized node's metadata.
SerializableDataType = Literal["DataFrame", "LazyFrame", "DocDataFrame", "DocLazyFrame"]
39
+
40
+
41
def extract_polars_data(data: SupportedDataTypes) -> pl.DataFrame | pl.LazyFrame:
    """Return the native Polars frame behind any supported data object.

    Polars frames are returned unchanged; ``DocDataFrame`` and
    ``DocLazyFrame`` wrappers are unwrapped through ``to_dataframe()`` and
    ``to_lazyframe()`` respectively. Operations such as join need the
    native Polars object.

    Raises:
        TypeError: If ``data`` is none of the supported types.
    """
    # Native Polars objects are checked first and need no unwrapping.
    if isinstance(data, (pl.DataFrame, pl.LazyFrame)):
        return data
    if isinstance(data, DocDataFrame):
        return data.to_dataframe()
    if isinstance(data, DocLazyFrame):
        return data.to_lazyframe()
    raise TypeError(f"Unsupported data type: {type(data)}")
55
+
56
+
57
+ class Node:
58
+ def __init__(
59
+ self,
60
+ data: SupportedDataTypes,
61
+ name: str | None = None,
62
+ workspace: Optional["Workspace"] = None,
63
+ parents: List["Node"] | None = None,
64
+ operation: str | None = None,
65
+ ) -> None:
66
+ from ..workspace.core import Workspace # local import to avoid cycle
67
+
68
+ self.id = str(uuid.uuid4())
69
+ self.name = name or f"node_{self.id[:8]}"
70
+
71
+ assert isinstance(data, SupportedDataTypes), (
72
+ f"Unsupported data type: {type(data).__name__}. Node supports pl.DataFrame, pl.LazyFrame, DocDataFrame, DocLazyFrame."
73
+ )
74
+
75
+ self.data = data
76
+ self.parents: list[Node] = parents or []
77
+ self.children: list[Node] = []
78
+
79
+ if workspace is None:
80
+ workspace = Workspace(name=f"workspace_for_{self.name}")
81
+ self.workspace: Workspace = workspace # type: ignore
82
+ self.operation = operation
83
+
84
+ if self.id not in self.workspace.nodes:
85
+ self.workspace.add_node(self)
86
+
87
+ for parent in self.parents:
88
+ parent.children.append(self)
89
+
90
+ # ------------------------------------------------------------------
91
+ # Delegation helpers
92
+ # ------------------------------------------------------------------
93
+ def _wrap_result(self, result: Any, op_name: str) -> Any:
94
+ """Wrap DataFrame-like results into a new Node preserving lineage.
95
+
96
+ Non-dataframe results (scalars, lists, etc.) are returned directly.
97
+ """
98
+ if isinstance(result, (pl.DataFrame, pl.LazyFrame, DocDataFrame, DocLazyFrame)):
99
+ return Node(
100
+ data=result,
101
+ name=f"{op_name}_{self.name}",
102
+ workspace=self.workspace,
103
+ parents=[self],
104
+ operation=op_name,
105
+ )
106
+ return result
107
+
108
+ def __getattr__(self, item: str) -> Any: # pragma: no cover - thin wrapper
109
+ # Delegate attribute access to underlying data object. If it's a
110
+ # callable returning a dataframe-like object we convert result to Node.
111
+ attr = getattr(self.data, item)
112
+ if callable(attr):
113
+
114
+ def wrapper(*args, **kwargs):
115
+ result = attr(*args, **kwargs)
116
+ return self._wrap_result(result, item)
117
+
118
+ return wrapper
119
+ return attr
120
+
121
+ # Commonly accessed convenience properties (explicit to avoid delegation surprises)
122
+ @property
123
+ def shape(self): # pragma: no cover - trivial delegation
124
+ return getattr(self.data, "shape", None)
125
+
126
+ @property
127
+ def columns(self): # pragma: no cover
128
+ return getattr(self.data, "columns", [])
129
+
130
+ # ------------------------------------------------------------------
131
+ # Explicit graph-producing dataframe operations
132
+ # ------------------------------------------------------------------
133
+ def filter(self, predicate: Any) -> "Node":
134
+ df = self.data
135
+ if hasattr(df, "filter"):
136
+ result = getattr(df, "filter")(predicate) # type: ignore[arg-type]
137
+ return Node(
138
+ data=result,
139
+ name=f"filter_{self.name}",
140
+ workspace=self.workspace,
141
+ parents=[self],
142
+ operation="filter",
143
+ )
144
+ raise AttributeError("Underlying data does not support filter")
145
+
146
+ def select(self, *columns: str) -> "Node":
147
+ df = self.data
148
+ if hasattr(df, "select"):
149
+ result = getattr(df, "select")(*columns)
150
+ return Node(
151
+ data=result,
152
+ name=f"select_{self.name}",
153
+ workspace=self.workspace,
154
+ parents=[self],
155
+ operation="select",
156
+ )
157
+ raise AttributeError("Underlying data does not support select")
158
+
159
+ def join(self, other: "Node", on: str | list[str], how: str = "inner") -> "Node":
160
+ # Extract underlying Polars data for both nodes
161
+ ldf = extract_polars_data(self.data)
162
+ rdf = extract_polars_data(other.data)
163
+
164
+ # Ensure both are the same type for join operation
165
+ # If one is lazy and other is not, convert the eager one to lazy
166
+ if isinstance(ldf, pl.LazyFrame) and isinstance(rdf, pl.DataFrame):
167
+ rdf = rdf.lazy()
168
+ elif isinstance(ldf, pl.DataFrame) and isinstance(rdf, pl.LazyFrame):
169
+ ldf = ldf.lazy()
170
+
171
+ if hasattr(ldf, "join"):
172
+ result = getattr(ldf, "join")(rdf, on=on, how=how) # type: ignore[arg-type]
173
+ return Node(
174
+ data=result,
175
+ name=f"join_{self.name}_{other.name}",
176
+ workspace=self.workspace,
177
+ parents=[self, other],
178
+ operation=f"join({how})",
179
+ )
180
+ raise AttributeError("Underlying data does not support join")
181
+
182
+ def slice(self, *args, **kwargs) -> "Node":
183
+ """Return a sliced Node.
184
+
185
+ Supports both slice objects and (offset, length) signatures similar to
186
+ polars. Examples:
187
+ node.slice(0, 10)
188
+ node.slice(slice(0, 10))
189
+ """
190
+ df = self.data
191
+ offset: int | None = None
192
+ length: int | None = None
193
+ if args and isinstance(args[0], slice):
194
+ sl: slice = args[0]
195
+ offset = 0 if sl.start is None else sl.start
196
+ if sl.stop is not None:
197
+ length = sl.stop - offset
198
+ elif args:
199
+ offset = args[0]
200
+ if len(args) > 1:
201
+ length = args[1]
202
+ else:
203
+ offset = 0
204
+ if not hasattr(df, "slice"):
205
+ raise AttributeError("Underlying data does not support slice operation")
206
+ # polars slice signature slice(offset, length=None)
207
+ result = getattr(df, "slice")(offset, length) # type: ignore[arg-type]
208
+ return Node(
209
+ data=result,
210
+ name=f"slice_{self.name}",
211
+ workspace=self.workspace,
212
+ parents=[self],
213
+ operation="slice",
214
+ )
215
+
216
+ # ------------------------------------------------------------------
217
+ # Properties
218
+ # ------------------------------------------------------------------
219
+ @property
220
+ def is_lazy(self) -> bool:
221
+ return isinstance(self.data, (pl.LazyFrame, DocLazyFrame))
222
+
223
+ @property
224
+ def document_column(self) -> Optional[str]:
225
+ if isinstance(self.data, (DocDataFrame, DocLazyFrame)):
226
+ return self.data.document_column
227
+ return None
228
+
229
+ # ------------------------------------------------------------------
230
+ # Schema / materialization utilities
231
+ # ------------------------------------------------------------------
232
+ def collect(self) -> "Node":
233
+ if (
234
+ self.is_lazy
235
+ and hasattr(self.data, "collect")
236
+ and callable(self.data.collect)
237
+ ):
238
+ try:
239
+ collected = self.data.collect()
240
+ new_node = Node(
241
+ data=collected,
242
+ name=f"collect_{self.name}",
243
+ workspace=self.workspace,
244
+ parents=[self],
245
+ operation=f"collect({self.name})",
246
+ )
247
+ self.workspace.add_node(new_node)
248
+ return new_node
249
+ except Exception:
250
+ return self
251
+ return self
252
+
253
+ def materialize(self) -> "Node":
254
+ if (
255
+ self.is_lazy
256
+ and hasattr(self.data, "collect")
257
+ and callable(self.data.collect)
258
+ ):
259
+ try:
260
+ self.data = self.data.collect()
261
+ except Exception:
262
+ pass
263
+ return self
264
+
265
+ def json_schema(self) -> Dict[str, str]:
266
+ """Return raw schema - JSON conversion should be handled by API layer."""
267
+ try:
268
+ schema = self.data.collect_schema() if self.is_lazy else self.data.schema
269
+ # Return raw schema as dict for API layer to convert
270
+ return {col: str(dtype) for col, dtype in schema.items()} if schema else {}
271
+ except Exception:
272
+ return {}
273
+
274
+ # ------------------------------------------------------------------
275
+ # Info / serialization (minimal)
276
+ # ------------------------------------------------------------------
277
+ def info(self) -> Dict[str, Any]:
278
+ """Get node information with raw schema data.
279
+
280
+ Returns raw Polars schema - JSON type conversion should be handled
281
+ by the API layer, not in the core docworkspace library.
282
+ """
283
+ dtype = type(self.data)
284
+ info_dict: Dict[str, Any] = {
285
+ "id": self.id,
286
+ "name": self.name,
287
+ "dtype": dtype, # Return actual type object - API layer will convert to string
288
+ "lazy": self.is_lazy,
289
+ "operation": self.operation,
290
+ "parent_ids": [p.id for p in self.parents],
291
+ "child_ids": [c.id for c in self.children],
292
+ }
293
+ if isinstance(self.data, (pl.DataFrame, DocDataFrame)):
294
+ info_dict["shape"] = getattr(self.data, "shape", (0, 0))
295
+ elif isinstance(self.data, (pl.LazyFrame, DocLazyFrame)):
296
+ lf = (
297
+ self.data.lazyframe
298
+ if isinstance(self.data, DocLazyFrame)
299
+ else self.data
300
+ )
301
+ try:
302
+ height = lf.select(pl.len()).collect().item()
303
+ width = len(lf.collect_schema().names())
304
+ info_dict["shape"] = (height, width)
305
+ except Exception:
306
+ info_dict["shape"] = (0, 0)
307
+ schema = None
308
+ try:
309
+ schema = self.data.collect_schema() if self.is_lazy else self.data.schema
310
+ except Exception:
311
+ pass
312
+ if schema is not None:
313
+ # Always return raw schema - API layer will convert to JS types
314
+ info_dict["schema"] = schema
315
+ else:
316
+ info_dict["schema"] = {}
317
+ if isinstance(self.data, (DocDataFrame, DocLazyFrame)):
318
+ info_dict["document_column"] = self.document_column
319
+ return info_dict
320
+
321
+ def _normalized_type(self) -> SerializableDataType:
322
+ if isinstance(self.data, DocDataFrame):
323
+ return "DocDataFrame"
324
+ if isinstance(self.data, DocLazyFrame):
325
+ return "DocLazyFrame"
326
+ if isinstance(self.data, pl.LazyFrame):
327
+ return "LazyFrame"
328
+ return "DataFrame"
329
+
330
+ def serialize(self, format: str = "json") -> Dict[str, Any]:
331
+ if format != "json":
332
+ raise ValueError(f"Unsupported format: {format}")
333
+ normalized = self._normalized_type()
334
+
335
+ # Suppress the deprecation warning for LazyFrame serialization
336
+ # This is mainly used for testing and persistence
337
+ import warnings
338
+
339
+ with warnings.catch_warnings():
340
+ warnings.simplefilter("ignore", UserWarning)
341
+ serialized_data = self.data.serialize(format="json")
342
+ data_metadata = {"type": normalized}
343
+ return {
344
+ "node_metadata": {
345
+ "id": self.id,
346
+ "name": self.name,
347
+ "operation": self.operation,
348
+ "data_type": normalized,
349
+ "is_lazy": normalized in ("LazyFrame", "DocLazyFrame"),
350
+ },
351
+ "data_metadata": data_metadata,
352
+ "serialized_data": serialized_data,
353
+ }
354
+
355
+ @classmethod
356
+ def deserialize(
357
+ cls,
358
+ serialized_node: Dict[str, Any],
359
+ workspace: "Workspace",
360
+ format: str = "json",
361
+ ) -> "Node":
362
+ import polars as pl
363
+
364
+ from docframe import DocDataFrame, DocLazyFrame
365
+
366
+ if format != "json":
367
+ raise ValueError(f"Unsupported format: {format}")
368
+
369
+ node_meta = serialized_node["node_metadata"]
370
+ data_meta = serialized_node["data_metadata"]
371
+ data_blob = serialized_node["serialized_data"]
372
+ data_type = data_meta["type"]
373
+
374
+ # Polars/DocFrame .serialize(format="json") returns a JSON string (or array-string)
375
+ # that DataFrame.deserialize expects as a file path *unless* provided a file-like.
376
+ # The previous implementation passed the raw string causing it to be interpreted
377
+ # as a (very long) file path, triggering OSError: File name too long.
378
+ # We detect non-path strings and wrap them in StringIO so Polars treats them as
379
+ # file-like objects containing the serialized payload.
380
+ from io import StringIO
381
+ from pathlib import Path as _P
382
+
383
+ def _wrap(blob: Any): # type: ignore[override]
384
+ if isinstance(blob, str):
385
+ try:
386
+ p = _P(blob)
387
+ # Treat as real path only if it exists on disk and is reasonably short
388
+ if p.exists():
389
+ return blob
390
+ except Exception: # pragma: no cover - path edge cases
391
+ pass
392
+ return StringIO(blob)
393
+ return blob
394
+
395
+ if data_type == "DocDataFrame":
396
+ data = DocDataFrame.deserialize(_wrap(data_blob), format="json")
397
+ elif data_type == "DocLazyFrame":
398
+ data = DocLazyFrame.deserialize(_wrap(data_blob), format="json")
399
+ elif data_type == "DataFrame":
400
+ data = pl.DataFrame.deserialize(_wrap(data_blob), format="json")
401
+ elif data_type == "LazyFrame":
402
+ data = pl.LazyFrame.deserialize(_wrap(data_blob), format="json")
403
+ else:
404
+ raise ValueError(f"Unknown data type: {data_meta['type']}")
405
+ node = cls.__new__(cls)
406
+ node.id = node_meta["id"]
407
+ node.name = node_meta["name"]
408
+ node.data = data
409
+ node.parents = []
410
+ node.children = []
411
+ node.workspace = workspace
412
+ node.operation = node_meta["operation"]
413
+ workspace.nodes[node.id] = node
414
+ return node
415
+
416
+ # Representation --------------------------------------------------
417
+ def __repr__(self) -> str: # pragma: no cover
418
+ return (
419
+ f"Node(id={self.id[:8]}, name='{self.name}', dtype={type(self.data).__name__}, "
420
+ f"lazy={self.is_lazy}, parents={len(self.parents)}, children={len(self.children)})"
421
+ )
422
+
423
+
424
+ __all__ = ["Node"]
@@ -0,0 +1,21 @@
1
+ """Workspace subpackage public exports.
2
+
3
+ Exports the :class:`Workspace` core class plus helper free functions for
4
+ serialization, analysis, and graph views so internal relative imports
5
+ (`from .workspace import Workspace`) keep working after the split.
6
+ """
7
+
8
+ from .analysis import info, summary # noqa: F401
9
+ from .core import Workspace # noqa: F401
10
+ from .graph_views import graph, visualize_graph # noqa: F401
11
+ from .io import deserialize_workspace, serialize_workspace # noqa: F401
12
+
13
+ __all__ = [
14
+ "Workspace",
15
+ "serialize_workspace",
16
+ "deserialize_workspace",
17
+ "summary",
18
+ "info",
19
+ "graph",
20
+ "visualize_graph",
21
+ ]
@@ -0,0 +1,45 @@
1
+ """Analysis & summary helpers split from monolithic workspace.py.
2
+
3
+ `summary` now returns richer information required by existing tests and API:
4
+ - total_nodes, root_nodes, leaf_nodes
5
+ - node_types counts, status_counts (lazy/eager)
6
+ - metadata_keys from workspace metadata
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import TYPE_CHECKING, Any, Dict
12
+
13
+ if TYPE_CHECKING: # pragma: no cover
14
+ from .core import Workspace
15
+
16
+
17
def summary(workspace: "Workspace", json: bool = False) -> Dict[str, Any]:
    """Build an overview dictionary describing ``workspace``.

    The result holds the workspace name and id, the total/root/leaf node
    counts, a per-type tally keyed by the class name of each node's data,
    lazy versus eager counts, and the keys of the workspace metadata.

    ``json`` is accepted for interface compatibility; this function does
    not read it.
    """
    node_total = len(workspace.nodes)
    root_total = len(workspace.get_root_nodes())
    leaf_total = len(workspace.get_leaf_nodes())

    # Single pass: tally data class names and count lazy nodes.
    type_tally: Dict[str, int] = {}
    lazy_total = 0
    for member in workspace.nodes.values():
        label = type(member.data).__name__
        type_tally[label] = type_tally.get(label, 0) + 1
        if member.is_lazy:
            lazy_total += 1

    report: Dict[str, Any] = {
        "workspace": workspace.name,
        "workspace_id": workspace.id,
        "total_nodes": node_total,
        "root_nodes": root_total,
        "leaf_nodes": leaf_total,
        "node_types": type_tally,
        "status_counts": {"lazy": lazy_total, "eager": node_total - lazy_total},
        "metadata_keys": list(workspace._metadata.keys()),  # type: ignore[attr-defined]
    }
    return report
39
+
40
+
41
def info(workspace: "Workspace", json: bool = False) -> Dict[str, Any]:
    """Alias of ``summary``: forwards both arguments and returns its result."""
    overview = summary(workspace, json=json)
    return overview
43
+
44
+
45
+ __all__ = ["summary", "info"]