docworkspace 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docworkspace-0.1.0/.gitignore +57 -0
- docworkspace-0.1.0/PKG-INFO +584 -0
- docworkspace-0.1.0/README.md +573 -0
- docworkspace-0.1.0/examples/data/ADO/candidate_info_gender.csv +134 -0
- docworkspace-0.1.0/examples/data/ADO/qldelection2020_candidate_tweets.csv +6048 -0
- docworkspace-0.1.0/examples/data/Hansard/economy_agenda.csv +5873 -0
- docworkspace-0.1.0/examples/data/Hansard/housing_agenda.csv +2281 -0
- docworkspace-0.1.0/pyproject.toml +19 -0
- docworkspace-0.1.0/pytest.ini +10 -0
- docworkspace-0.1.0/src/docworkspace/__init__.py +11 -0
- docworkspace-0.1.0/src/docworkspace/node/__init__.py +10 -0
- docworkspace-0.1.0/src/docworkspace/node/core.py +424 -0
- docworkspace-0.1.0/src/docworkspace/workspace/__init__.py +21 -0
- docworkspace-0.1.0/src/docworkspace/workspace/analysis.py +45 -0
- docworkspace-0.1.0/src/docworkspace/workspace/core.py +254 -0
- docworkspace-0.1.0/src/docworkspace/workspace/graph_views.py +57 -0
- docworkspace-0.1.0/src/docworkspace/workspace/io.py +84 -0
- docworkspace-0.1.0/tests/conftest.py +69 -0
- docworkspace-0.1.0/tests/test_fastapi_integration.py +91 -0
- docworkspace-0.1.0/tests/test_node.py +263 -0
- docworkspace-0.1.0/tests/test_simple_operations.py +89 -0
- docworkspace-0.1.0/tests/test_workspace.py +556 -0
- docworkspace-0.1.0/tests/test_workspace_serialization_types.py +82 -0
- docworkspace-0.1.0/tests/test_workspace_shim.py +14 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Python-generated files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv
|
|
11
|
+
venv/
|
|
12
|
+
ENV/
|
|
13
|
+
env/
|
|
14
|
+
.env
|
|
15
|
+
|
|
16
|
+
# uv
|
|
17
|
+
.python-version
|
|
18
|
+
uv.lock
|
|
19
|
+
|
|
20
|
+
# Pytest
|
|
21
|
+
.pytest_cache/
|
|
22
|
+
.coverage
|
|
23
|
+
htmlcov/
|
|
24
|
+
|
|
25
|
+
# IDEs and editors
|
|
26
|
+
.vscode/
|
|
27
|
+
.idea/
|
|
28
|
+
*.swp
|
|
29
|
+
*.swo
|
|
30
|
+
*~
|
|
31
|
+
|
|
32
|
+
# Operating System
|
|
33
|
+
.DS_Store
|
|
34
|
+
.DS_Store?
|
|
35
|
+
._*
|
|
36
|
+
|
|
37
|
+
# Temporary files
|
|
38
|
+
*.tmp
|
|
39
|
+
*.temp
|
|
40
|
+
*.bak
|
|
41
|
+
*.backup
|
|
42
|
+
|
|
43
|
+
# MyPy
|
|
44
|
+
.mypy_cache/
|
|
45
|
+
.dmypy.json
|
|
46
|
+
dmypy.json
|
|
47
|
+
|
|
48
|
+
# Testing
|
|
49
|
+
.tox/
|
|
50
|
+
.nox/
|
|
51
|
+
.hypothesis/
|
|
52
|
+
|
|
53
|
+
# Machine learning models
|
|
54
|
+
*.pkl
|
|
55
|
+
*.pickle
|
|
56
|
+
*.joblib
|
|
57
|
+
*.model
|
|
@@ -0,0 +1,584 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docworkspace
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A workspace library for managing DocDataFrames and DataFrames with parent-child relationships and lazy evaluation
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Requires-Dist: docframe
|
|
7
|
+
Requires-Dist: typing-extensions
|
|
8
|
+
Provides-Extra: cpu
|
|
9
|
+
Requires-Dist: docframe[cpu]; extra == 'cpu'
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# DocWorkspace
|
|
13
|
+
|
|
14
|
+
A powerful Python library for managing DataFrames and DocDataFrames with parent-child relationships, lazy evaluation, and FastAPI integration. Part of the LDaCA (Language Data Commons of Australia) ecosystem.
|
|
15
|
+
|
|
16
|
+
## Overview
|
|
17
|
+
|
|
18
|
+
DocWorkspace provides a workspace-based approach to data analysis, where data transformations are tracked as nodes in a directed graph. This enables:
|
|
19
|
+
|
|
20
|
+
- **Relationship Tracking**: Understand data lineage and transformation history
|
|
21
|
+
- **Lazy Evaluation**: Optimize performance with Polars LazyFrames
|
|
22
|
+
- **Multiple Data Types**: Support for Polars DataFrames, LazyFrames, DocDataFrames, and DocLazyFrames
|
|
23
|
+
- **FastAPI Integration**: Ready-to-use models and utilities for web APIs
|
|
24
|
+
- **Serialization**: Save and restore entire workspaces with their relationships
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install docworkspace
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Dependencies
|
|
33
|
+
|
|
34
|
+
- Python ≥ 3.12
|
|
35
|
+
- polars ≥ 0.20.0
|
|
36
|
+
- docframe
|
|
37
|
+
- pandas ≥ 2.0.0
|
|
38
|
+
- typing-extensions
|
|
39
|
+
|
|
40
|
+
For FastAPI integration:
|
|
41
|
+
```bash
|
|
42
|
+
pip install pydantic
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Quick Start
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import polars as pl
|
|
49
|
+
from docworkspace import Node, Workspace
|
|
50
|
+
from docframe import DocDataFrame
|
|
51
|
+
|
|
52
|
+
# Create a workspace
|
|
53
|
+
workspace = Workspace("my_analysis")
|
|
54
|
+
|
|
55
|
+
# Load data
|
|
56
|
+
df = pl.DataFrame({
|
|
57
|
+
"text": ["Hello world", "Data science", "Python rocks"],
|
|
58
|
+
"category": ["greeting", "tech", "programming"],
|
|
59
|
+
"score": [0.8, 0.9, 0.95]
|
|
60
|
+
})
|
|
61
|
+
|
|
62
|
+
# Add data to workspace
|
|
63
|
+
data_node = workspace.add_node(Node(df, name="raw_data"))
|
|
64
|
+
|
|
65
|
+
# Apply transformations (creates new nodes automatically)
|
|
66
|
+
filtered = data_node.filter(pl.col("score") > 0.85)
|
|
67
|
+
grouped = filtered.group_by("category").agg(pl.col("score").mean())
|
|
68
|
+
|
|
69
|
+
# Check relationships
|
|
70
|
+
print(f"Total nodes: {len(workspace.nodes)}")
|
|
71
|
+
print(f"Root nodes: {len(workspace.get_root_nodes())}")
|
|
72
|
+
print(f"Leaf nodes: {len(workspace.get_leaf_nodes())}")
|
|
73
|
+
|
|
74
|
+
# Visualize the computation graph
|
|
75
|
+
print(workspace.visualize_graph())
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Core Concepts
|
|
79
|
+
|
|
80
|
+
### Node
|
|
81
|
+
|
|
82
|
+
A `Node` wraps your data (DataFrames, LazyFrames, DocDataFrames) and tracks relationships with other nodes. Nodes support:
|
|
83
|
+
|
|
84
|
+
- **Transparent Data Access**: All DataFrame methods work directly on nodes
|
|
85
|
+
- **Automatic Relationship Tracking**: Operations create child nodes
|
|
86
|
+
- **Lazy Evaluation**: Maintains laziness for performance
|
|
87
|
+
- **Metadata**: Store operation descriptions and custom metadata
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
# Node automatically creates workspace if none provided
|
|
91
|
+
node = Node(df, name="my_data")
|
|
92
|
+
|
|
93
|
+
# All DataFrame operations work directly
|
|
94
|
+
filtered_node = node.filter(pl.col("value") > 10)
|
|
95
|
+
sorted_node = filtered_node.sort("value", descending=True)
|
|
96
|
+
|
|
97
|
+
# Check relationships
|
|
98
|
+
print(f"Children of original node: {len(node.children)}")
|
|
99
|
+
print(f"Parents of sorted node: {len(sorted_node.parents)}")
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Workspace
|
|
103
|
+
|
|
104
|
+
A `Workspace` manages collections of nodes and provides graph operations:
|
|
105
|
+
|
|
106
|
+
- **Node Management**: Add, remove, and retrieve nodes
|
|
107
|
+
- **Graph Operations**: Find roots, leaves, descendants, ancestors
|
|
108
|
+
- **Serialization**: Save/load entire workspaces
|
|
109
|
+
- **Visualization**: Generate text-based and programmatic graph representations
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
workspace = Workspace("analysis")
|
|
113
|
+
|
|
114
|
+
# Add nodes
|
|
115
|
+
node1 = workspace.add_node(Node(df1, "dataset1"))
|
|
116
|
+
node2 = workspace.add_node(Node(df2, "dataset2"))
|
|
117
|
+
|
|
118
|
+
# Join creates a new node with both parents
|
|
119
|
+
joined = node1.join(node2, on="id")
|
|
120
|
+
|
|
121
|
+
# Explore the graph
|
|
122
|
+
roots = workspace.get_root_nodes()
|
|
123
|
+
leaves = workspace.get_leaf_nodes()
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Supported Data Types
|
|
127
|
+
|
|
128
|
+
DocWorkspace supports multiple data types from the Polars and DocFrame ecosystems:
|
|
129
|
+
|
|
130
|
+
### Polars Types
|
|
131
|
+
- **`pl.DataFrame`**: Materialized, in-memory data
|
|
132
|
+
- **`pl.LazyFrame`**: Lazy evaluation for performance optimization
|
|
133
|
+
|
|
134
|
+
### DocFrame Types
|
|
135
|
+
- **`DocDataFrame`**: Enhanced DataFrame for text analysis with document tracking
|
|
136
|
+
- **`DocLazyFrame`**: Lazy version of DocDataFrame
|
|
137
|
+
|
|
138
|
+
### Example with Different Types
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
import polars as pl
|
|
142
|
+
from docframe import DocDataFrame, DocLazyFrame
|
|
143
|
+
|
|
144
|
+
# Polars DataFrame (eager)
|
|
145
|
+
df = pl.DataFrame({"text": ["hello", "world"], "id": [1, 2]})
|
|
146
|
+
node1 = Node(df, "eager_data")
|
|
147
|
+
|
|
148
|
+
# Polars LazyFrame (lazy)
|
|
149
|
+
lazy_df = pl.LazyFrame({"text": ["foo", "bar"], "id": [3, 4]})
|
|
150
|
+
node2 = Node(lazy_df, "lazy_data")
|
|
151
|
+
|
|
152
|
+
# DocDataFrame (eager, with document column)
|
|
153
|
+
doc_df = DocDataFrame(df, document_column="text")
|
|
154
|
+
node3 = Node(doc_df, "doc_data")
|
|
155
|
+
|
|
156
|
+
# DocLazyFrame (lazy, with document column)
|
|
157
|
+
doc_lazy = DocLazyFrame(lazy_df, document_column="text")
|
|
158
|
+
node4 = Node(doc_lazy, "doc_lazy_data")
|
|
159
|
+
|
|
160
|
+
# All work seamlessly in the same workspace
|
|
161
|
+
workspace = Workspace("mixed_types")
|
|
162
|
+
for node in [node1, node2, node3, node4]:
|
|
163
|
+
    workspace.add_node(node)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Key Features
|
|
167
|
+
|
|
168
|
+
### 1. Lazy Evaluation
|
|
169
|
+
|
|
170
|
+
DocWorkspace preserves Polars' lazy evaluation capabilities:
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
# Start with lazy data
|
|
174
|
+
lazy_df = pl.scan_csv("large_file.csv")
|
|
175
|
+
node = Node(lazy_df, "raw_data")
|
|
176
|
+
|
|
177
|
+
# Chain operations (all remain lazy)
|
|
178
|
+
filtered = node.filter(pl.col("value") > 100)
|
|
179
|
+
grouped = filtered.group_by("category").agg(pl.col("value").sum())
|
|
180
|
+
sorted_result = grouped.sort("value", descending=True)
|
|
181
|
+
|
|
182
|
+
# Only materialize when needed
|
|
183
|
+
final_result = sorted_result.collect() # This creates a new materialized node
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### 2. Relationship Tracking
|
|
187
|
+
|
|
188
|
+
Understand your data lineage:
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
# Create a processing pipeline
|
|
192
|
+
raw_data = Node(df, "raw")
|
|
193
|
+
cleaned = raw_data.filter(pl.col("value").is_not_null())
|
|
194
|
+
normalized = cleaned.with_columns((pl.col("value") / pl.col("value").max()).alias("normalized_value"))
|
|
195
|
+
final = normalized.select(["id", "normalized_value"])
|
|
196
|
+
|
|
197
|
+
# Explore relationships
|
|
198
|
+
print("Processing chain:")
|
|
199
|
+
current = final
|
|
200
|
+
while current.parents:
|
|
201
|
+
    parent = current.parents[0]
|
|
202
|
+
    print(f"{parent.name} -> {current.name} ({current.operation})")
|
|
203
|
+
    current = parent
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### 3. FastAPI Integration
|
|
207
|
+
|
|
208
|
+
Ready-to-use models for web APIs:
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
from docworkspace import FastAPIUtils, WorkspaceGraph, NodeSummary
|
|
212
|
+
|
|
213
|
+
# Convert workspace to FastAPI-compatible format
|
|
214
|
+
graph_data = workspace.to_api_graph()
|
|
215
|
+
|
|
216
|
+
# Get node summaries
|
|
217
|
+
summaries = [FastAPIUtils.node_to_summary(node) for node in workspace.nodes.values()]
|
|
218
|
+
|
|
219
|
+
# Get paginated data
|
|
220
|
+
paginated = FastAPIUtils.get_paginated_data(node, page=1, page_size=100)
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
### 4. Serialization
|
|
224
|
+
|
|
225
|
+
Save and restore complete workspaces:
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
# Save workspace with all nodes and relationships
|
|
229
|
+
workspace.serialize("my_workspace.json")
|
|
230
|
+
|
|
231
|
+
# Load workspace later
|
|
232
|
+
restored_workspace = Workspace.deserialize("my_workspace.json")
|
|
233
|
+
|
|
234
|
+
# All nodes and relationships are preserved
|
|
235
|
+
assert len(restored_workspace.nodes) == len(workspace.nodes)
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## Advanced Usage
|
|
239
|
+
|
|
240
|
+
### Custom Operations
|
|
241
|
+
|
|
242
|
+
Create custom operations that maintain relationships:
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
def custom_transform(node: Node, operation_name: str) -> Node:
|
|
246
|
+
    """Apply custom transformation and track the operation."""
|
|
247
|
+
    # Your custom logic here
|
|
248
|
+
    result_data = node.data.with_columns(pl.col("value") * 2)
|
|
249
|
+
|
|
250
|
+
    # Create new node with relationship tracking
|
|
251
|
+
    return Node(
|
|
252
|
+
        data=result_data,
|
|
253
|
+
        name=f"{operation_name}_{node.name}",
|
|
254
|
+
        workspace=node.workspace,
|
|
255
|
+
        parents=[node],
|
|
256
|
+
        operation=operation_name
|
|
257
|
+
    )
|
|
258
|
+
|
|
259
|
+
# Use custom operation
|
|
260
|
+
transformed = custom_transform(original_node, "double_values")
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
### Graph Analysis
|
|
264
|
+
|
|
265
|
+
Analyze your computation graph:
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
# Find all descendants of a node
|
|
269
|
+
descendants = workspace.get_descendants(node.id)
|
|
270
|
+
|
|
271
|
+
# Find all ancestors
|
|
272
|
+
ancestors = workspace.get_ancestors(node.id)
|
|
273
|
+
|
|
274
|
+
# Get topological ordering
|
|
275
|
+
ordered_nodes = workspace.get_topological_order()
|
|
276
|
+
|
|
277
|
+
# Check for cycles (shouldn't happen in normal usage)
|
|
278
|
+
has_cycles = workspace.has_cycles()
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### Working with DocDataFrames
|
|
282
|
+
|
|
283
|
+
Enhanced text analysis capabilities:
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
from docframe import DocDataFrame
|
|
287
|
+
|
|
288
|
+
# Create DocDataFrame with document column
|
|
289
|
+
df = pl.DataFrame({
|
|
290
|
+
"doc_id": ["d1", "d2", "d3"],
|
|
291
|
+
"text": ["Hello world", "Data science", "Python rocks"],
|
|
292
|
+
"metadata": ["type1", "type2", "type1"]
|
|
293
|
+
})
|
|
294
|
+
|
|
295
|
+
doc_df = DocDataFrame(df, document_column="text")
|
|
296
|
+
node = Node(doc_df, "corpus")
|
|
297
|
+
|
|
298
|
+
# DocDataFrame operations work seamlessly
|
|
299
|
+
filtered = node.filter(pl.col("metadata") == "type1")
|
|
300
|
+
print(f"Document column preserved: {filtered.data.document_column}")
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
## API Reference
|
|
304
|
+
|
|
305
|
+
### Node Class
|
|
306
|
+
|
|
307
|
+
#### Constructor
|
|
308
|
+
```python
|
|
309
|
+
Node(data, name=None, workspace=None, parents=None, operation=None)
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
#### Properties
|
|
313
|
+
- `is_lazy: bool` - Whether the underlying data is lazy
|
|
314
|
+
- `document_column: Optional[str]` - Document column for DocDataFrames
|
|
315
|
+
|
|
316
|
+
#### Methods
|
|
317
|
+
- `collect() -> Node` - Materialize lazy data (creates new node)
|
|
318
|
+
- `materialize() -> Node` - Alias for collect()
|
|
319
|
+
- `info(json=False) -> Dict` - Get node information
|
|
320
|
+
- `json_schema() -> Dict[str, str]` - Get JSON-compatible schema
|
|
321
|
+
|
|
322
|
+
#### DataFrame Operations
|
|
323
|
+
All Polars DataFrame/LazyFrame operations are available directly:
|
|
324
|
+
- `filter(condition) -> Node`
|
|
325
|
+
- `select(columns) -> Node`
|
|
326
|
+
- `with_columns(*exprs) -> Node`
|
|
327
|
+
- `group_by(*columns) -> Node`
|
|
328
|
+
- `sort(by, descending=False) -> Node`
|
|
329
|
+
- `join(other, on, how="inner") -> Node`
|
|
330
|
+
- And many more...
|
|
331
|
+
|
|
332
|
+
### Workspace Class
|
|
333
|
+
|
|
334
|
+
#### Constructor
|
|
335
|
+
```python
|
|
336
|
+
Workspace(name=None, data=None, data_name=None, csv_lazy=True, **csv_kwargs)
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
#### Properties
|
|
340
|
+
- `id: str` - Unique workspace identifier
|
|
341
|
+
- `name: str` - Human-readable name
|
|
342
|
+
- `nodes: Dict[str, Node]` - All nodes in the workspace
|
|
343
|
+
|
|
344
|
+
#### Methods
|
|
345
|
+
|
|
346
|
+
##### Node Management
|
|
347
|
+
- `add_node(node) -> Node` - Add a node to the workspace
|
|
348
|
+
- `remove_node(node_id, materialize_children=False) -> bool` - Remove a node
|
|
349
|
+
- `get_node(node_id) -> Optional[Node]` - Get node by ID
|
|
350
|
+
- `get_node_by_name(name) -> Optional[Node]` - Get node by name
|
|
351
|
+
- `list_nodes() -> List[Node]` - Get all nodes
|
|
352
|
+
|
|
353
|
+
##### Graph Operations
|
|
354
|
+
- `get_root_nodes() -> List[Node]` - Nodes with no parents
|
|
355
|
+
- `get_leaf_nodes() -> List[Node]` - Nodes with no children
|
|
356
|
+
- `get_descendants(node_id) -> List[Node]` - All descendant nodes
|
|
357
|
+
- `get_ancestors(node_id) -> List[Node]` - All ancestor nodes
|
|
358
|
+
- `get_topological_order() -> List[Node]` - Topologically sorted nodes
|
|
359
|
+
|
|
360
|
+
##### Visualization
|
|
361
|
+
- `visualize_graph() -> str` - Text-based graph visualization
|
|
362
|
+
- `graph() -> Dict` - Generic graph structure
|
|
363
|
+
- `to_react_flow_json() -> Dict` - React Flow compatible format
|
|
364
|
+
|
|
365
|
+
##### Serialization
|
|
366
|
+
- `serialize(file_path)` - Save workspace to JSON
|
|
367
|
+
- `deserialize(file_path) -> Workspace` - Load workspace from JSON
|
|
368
|
+
- `from_dict(workspace_dict) -> Workspace` - Create from dictionary
|
|
369
|
+
|
|
370
|
+
##### Metadata
|
|
371
|
+
- `get_metadata(key) -> Any` - Get workspace metadata
|
|
372
|
+
- `set_metadata(key, value)` - Set workspace metadata
|
|
373
|
+
- `summary() -> Dict` - Get workspace summary
|
|
374
|
+
- `info() -> Dict` - Alias for summary()
|
|
375
|
+
|
|
376
|
+
### FastAPI Integration
|
|
377
|
+
|
|
378
|
+
#### Models
|
|
379
|
+
- `NodeSummary` - API-friendly node representation
|
|
380
|
+
- `WorkspaceGraph` - React Flow compatible graph
|
|
381
|
+
- `PaginatedData` - Paginated data response
|
|
382
|
+
- `ErrorResponse` - Standard error format
|
|
383
|
+
- `OperationResult` - Operation result wrapper
|
|
384
|
+
|
|
385
|
+
#### Utilities
|
|
386
|
+
```python
|
|
387
|
+
FastAPIUtils.node_to_summary(node) -> NodeSummary
|
|
388
|
+
FastAPIUtils.get_paginated_data(node, page=1, page_size=100) -> PaginatedData
|
|
389
|
+
FastAPIUtils.workspace_to_react_flow(workspace) -> WorkspaceGraph
|
|
390
|
+
```
|
|
391
|
+
|
|
392
|
+
## Examples
|
|
393
|
+
|
|
394
|
+
### Example 1: Text Analysis Pipeline
|
|
395
|
+
|
|
396
|
+
```python
|
|
397
|
+
import polars as pl
|
|
398
|
+
from docworkspace import Node, Workspace
|
|
399
|
+
from docframe import DocDataFrame
|
|
400
|
+
|
|
401
|
+
# Sample text data
|
|
402
|
+
df = pl.DataFrame({
|
|
403
|
+
"doc_id": [f"doc_{i}" for i in range(100)],
|
|
404
|
+
"text": [f"Sample text content {i}" for i in range(100)],
|
|
405
|
+
"category": (["news", "blog", "academic"] * 34)[:100],
|
|
406
|
+
"year": [2020, 2021, 2022, 2023] * 25
|
|
407
|
+
})
|
|
408
|
+
|
|
409
|
+
# Create workspace
|
|
410
|
+
workspace = Workspace("text_analysis")
|
|
411
|
+
|
|
412
|
+
# Load as DocDataFrame for text analysis
|
|
413
|
+
doc_df = DocDataFrame(df, document_column="text")
|
|
414
|
+
corpus = workspace.add_node(Node(doc_df, "full_corpus"))
|
|
415
|
+
|
|
416
|
+
# Filter by category
|
|
417
|
+
news_docs = corpus.filter(pl.col("category") == "news")
|
|
418
|
+
blog_docs = corpus.filter(pl.col("category") == "blog")
|
|
419
|
+
|
|
420
|
+
# Filter by recent years
|
|
421
|
+
recent_news = news_docs.filter(pl.col("year") >= 2022)
|
|
422
|
+
|
|
423
|
+
# Group analysis
|
|
424
|
+
year_stats = corpus.group_by(["category", "year"]).agg(
|
|
425
|
+
pl.count().alias("doc_count")
|
|
426
|
+
)
|
|
427
|
+
|
|
428
|
+
# Materialize results
|
|
429
|
+
final_stats = year_stats.collect()
|
|
430
|
+
|
|
431
|
+
# Analyze the computation graph
|
|
432
|
+
print(workspace.visualize_graph())
|
|
433
|
+
print(f"Total transformations: {len(workspace.nodes)}")
|
|
434
|
+
```
|
|
435
|
+
|
|
436
|
+
### Example 2: Lazy Data Processing
|
|
437
|
+
|
|
438
|
+
```python
|
|
439
|
+
import polars as pl
|
|
440
|
+
from docworkspace import Workspace
|
|
441
|
+
|
|
442
|
+
# Create workspace with lazy CSV loading
|
|
443
|
+
workspace = Workspace(
|
|
444
|
+
"large_data_analysis",
|
|
445
|
+
data="large_dataset.csv", # Path to CSV
|
|
446
|
+
data_name="raw_data",
|
|
447
|
+
csv_lazy=True # Load as LazyFrame for performance
|
|
448
|
+
)
|
|
449
|
+
|
|
450
|
+
# Get the loaded node
|
|
451
|
+
raw_data = workspace.get_node_by_name("raw_data")
|
|
452
|
+
print(f"Is lazy: {raw_data.is_lazy}") # True
|
|
453
|
+
|
|
454
|
+
# Chain transformations (all remain lazy)
|
|
455
|
+
cleaned = raw_data.filter(pl.col("value").is_not_null())
|
|
456
|
+
normalized = cleaned.with_columns(
|
|
457
|
+
(pl.col("value") / pl.col("value").max()).alias("normalized")
|
|
458
|
+
)
|
|
459
|
+
aggregated = normalized.group_by("category").agg([
|
|
460
|
+
pl.col("normalized").mean().alias("avg_normalized"),
|
|
461
|
+
pl.count().alias("count")
|
|
462
|
+
])
|
|
463
|
+
|
|
464
|
+
# Still lazy until we collect
|
|
465
|
+
print(f"Aggregated is lazy: {aggregated.is_lazy}") # True
|
|
466
|
+
|
|
467
|
+
# Materialize only the final result
|
|
468
|
+
result = aggregated.collect()
|
|
469
|
+
print(f"Result is lazy: {result.is_lazy}") # False
|
|
470
|
+
|
|
471
|
+
# Save the entire workspace with lazy evaluation preserved
|
|
472
|
+
workspace.serialize("lazy_analysis.json")
|
|
473
|
+
```
|
|
474
|
+
|
|
475
|
+
### Example 3: Multi-Source Data Integration
|
|
476
|
+
|
|
477
|
+
```python
|
|
478
|
+
import polars as pl
|
|
479
|
+
from docworkspace import Node, Workspace
|
|
480
|
+
|
|
481
|
+
workspace = Workspace("data_integration")
|
|
482
|
+
|
|
483
|
+
# Load data from multiple sources
|
|
484
|
+
sales_df = pl.DataFrame({
|
|
485
|
+
"customer_id": [1, 2, 3, 4],
|
|
486
|
+
"sales": [100, 200, 150, 300],
|
|
487
|
+
"region": ["North", "South", "East", "West"]
|
|
488
|
+
})
|
|
489
|
+
|
|
490
|
+
customer_df = pl.DataFrame({
|
|
491
|
+
"customer_id": [1, 2, 3, 4, 5],
|
|
492
|
+
"name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
|
|
493
|
+
"segment": ["Premium", "Regular", "Premium", "Regular", "Premium"]
|
|
494
|
+
})
|
|
495
|
+
|
|
496
|
+
# Add to workspace
|
|
497
|
+
sales_node = workspace.add_node(Node(sales_df, "sales_data"))
|
|
498
|
+
customer_node = workspace.add_node(Node(customer_df, "customer_data"))
|
|
499
|
+
|
|
500
|
+
# Join the datasets
|
|
501
|
+
combined = sales_node.join(customer_node, on="customer_id", how="inner")
|
|
502
|
+
|
|
503
|
+
# Analyze by segment
|
|
504
|
+
segment_analysis = combined.group_by("segment").agg([
|
|
505
|
+
pl.col("sales").sum().alias("total_sales"),
|
|
506
|
+
pl.col("sales").mean().alias("avg_sales"),
|
|
507
|
+
pl.count().alias("customer_count")
|
|
508
|
+
])
|
|
509
|
+
|
|
510
|
+
# Filter high-value segments
|
|
511
|
+
high_value = segment_analysis.filter(pl.col("total_sales") > 200)
|
|
512
|
+
|
|
513
|
+
print(f"Nodes in workspace: {len(workspace.nodes)}")
|
|
514
|
+
print("Data lineage:")
|
|
515
|
+
for node in workspace.get_leaf_nodes():
|
|
516
|
+
    print(f"Leaf node: {node.name}")
|
|
517
|
+
```
|
|
518
|
+
|
|
519
|
+
## Development
|
|
520
|
+
|
|
521
|
+
### Running Tests
|
|
522
|
+
|
|
523
|
+
```bash
|
|
524
|
+
# Install development dependencies
|
|
525
|
+
pip install pytest pytest-cov
|
|
526
|
+
|
|
527
|
+
# Run all tests
|
|
528
|
+
pytest
|
|
529
|
+
|
|
530
|
+
# Run with coverage
|
|
531
|
+
pytest --cov=docworkspace
|
|
532
|
+
|
|
533
|
+
# Run specific test file
|
|
534
|
+
pytest tests/test_workspace.py -v
|
|
535
|
+
```
|
|
536
|
+
|
|
537
|
+
### Contributing
|
|
538
|
+
|
|
539
|
+
1. Fork the repository
|
|
540
|
+
2. Create a feature branch: `git checkout -b feature-name`
|
|
541
|
+
3. Make your changes and add tests
|
|
542
|
+
4. Run the test suite: `pytest`
|
|
543
|
+
5. Submit a pull request
|
|
544
|
+
|
|
545
|
+
### Project Structure
|
|
546
|
+
|
|
547
|
+
```
|
|
548
|
+
docworkspace/
|
|
549
|
+
├── src/docworkspace/ # Main package
|
|
550
|
+
│ ├── __init__.py # Package exports
|
|
551
|
+
│ ├── node/ # Node class implementation (core.py)
|
|
552
|
+
│ ├── workspace/ # Workspace class implementation (core.py, analysis.py, graph_views.py, io.py)
|
|
553
|
+
│ ├── api_models.py # FastAPI Pydantic models
|
|
554
|
+
│ └── api_utils.py # FastAPI utility functions
|
|
555
|
+
├── tests/ # Test suite
|
|
556
|
+
│ ├── test_node.py # Node class tests
|
|
557
|
+
│ ├── test_workspace.py # Workspace class tests
|
|
558
|
+
│ ├── test_fastapi_integration.py # FastAPI integration tests
|
|
559
|
+
│ └── test_coverage.py # Coverage tests
|
|
560
|
+
├── examples/ # Example scripts and data
|
|
561
|
+
├── README.md # This file
|
|
562
|
+
└── pyproject.toml # Project configuration
|
|
563
|
+
```
|
|
564
|
+
|
|
565
|
+
## License
|
|
566
|
+
|
|
567
|
+
Part of the LDaCA (Language Data Commons of Australia) ecosystem.
|
|
568
|
+
|
|
569
|
+
## Changelog
|
|
570
|
+
|
|
571
|
+
### Version 0.1.0
|
|
572
|
+
- Initial release
|
|
573
|
+
- Core Node and Workspace functionality
|
|
574
|
+
- Support for Polars and DocFrame data types
|
|
575
|
+
- Lazy evaluation support
|
|
576
|
+
- FastAPI integration
|
|
577
|
+
- Serialization capabilities
|
|
578
|
+
- Comprehensive test suite
|
|
579
|
+
|
|
580
|
+
## Related Projects
|
|
581
|
+
|
|
582
|
+
- **[DocFrame](https://github.com/ldaca/docframe)**: Enhanced DataFrames for text analysis
|
|
583
|
+
- **[LDaCA Web App](https://github.com/ldaca/ldaca_web_app)**: Full-stack web application using DocWorkspace
|
|
584
|
+
- **[Polars](https://pola.rs/)**: Fast DataFrame library with lazy evaluation
|