vcti-dataflow 2.0.0__tar.gz → 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vcti_dataflow-3.0.0/PKG-INFO +167 -0
- vcti_dataflow-3.0.0/README.md +146 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/pyproject.toml +6 -4
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti/flow/data/__init__.py +3 -3
- vcti_dataflow-3.0.0/src/vcti/flow/fieldset/__init__.py +73 -0
- vcti_dataflow-3.0.0/src/vcti/flow/fieldset/aliases.py +26 -0
- vcti_dataflow-3.0.0/src/vcti/flow/fieldset/iterate.py +36 -0
- vcti_dataflow-3.0.0/src/vcti/flow/fieldset/py.typed +0 -0
- vcti_dataflow-3.0.0/src/vcti/flow/fieldset/reducers.py +116 -0
- vcti_dataflow-3.0.0/src/vcti/flow/fieldset/sinks.py +45 -0
- vcti_dataflow-3.0.0/src/vcti/flow/fieldset/sources.py +70 -0
- vcti_dataflow-3.0.0/src/vcti/flow/fieldset/transforms.py +192 -0
- vcti_dataflow-3.0.0/src/vcti_dataflow.egg-info/PKG-INFO +167 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti_dataflow.egg-info/SOURCES.txt +9 -7
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti_dataflow.egg-info/requires.txt +1 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/tests/test_data.py +16 -0
- vcti_dataflow-3.0.0/tests/test_fieldset.py +350 -0
- vcti_dataflow-2.0.0/PKG-INFO +0 -147
- vcti_dataflow-2.0.0/README.md +0 -127
- vcti_dataflow-2.0.0/src/vcti/flow/data/fields/__init__.py +0 -41
- vcti_dataflow-2.0.0/src/vcti/flow/data/fields/iterate.py +0 -48
- vcti_dataflow-2.0.0/src/vcti/flow/data/fields/merge.py +0 -53
- vcti_dataflow-2.0.0/src/vcti/flow/data/fields/sources.py +0 -94
- vcti_dataflow-2.0.0/src/vcti/flow/data/fields/transforms.py +0 -203
- vcti_dataflow-2.0.0/src/vcti_dataflow.egg-info/PKG-INFO +0 -147
- vcti_dataflow-2.0.0/tests/test_fields.py +0 -95
- vcti_dataflow-2.0.0/tests/test_shaping.py +0 -309
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/LICENSE +0 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/setup.cfg +0 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti/flow/data/aliases.py +0 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti/flow/data/py.typed +0 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti/flow/data/record.py +0 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti/flow/data/sources.py +0 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti_dataflow.egg-info/dependency_links.txt +0 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti_dataflow.egg-info/top_level.txt +0 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti_dataflow.egg-info/zip-safe +0 -0
- {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/tests/test_version.py +0 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vcti-dataflow
|
|
3
|
+
Version: 3.0.0
|
|
4
|
+
Summary: vcti-flow bindings for array data: a DataNode payload binding and a FieldSet table binding (sources, transformers, reducers, sinks).
|
|
5
|
+
Author: Visual Collaboration Technologies Inc.
|
|
6
|
+
Requires-Python: <3.15,>=3.12
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: vcti-flow>=2.0.0
|
|
10
|
+
Requires-Dist: vcti-datanode>=2.0.0
|
|
11
|
+
Requires-Dist: vcti-fieldset[datanode]>=2.0.0
|
|
12
|
+
Requires-Dist: numpy>=1.24
|
|
13
|
+
Provides-Extra: test
|
|
14
|
+
Requires-Dist: pytest; extra == "test"
|
|
15
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
16
|
+
Provides-Extra: lint
|
|
17
|
+
Requires-Dist: ruff; extra == "lint"
|
|
18
|
+
Provides-Extra: typecheck
|
|
19
|
+
Requires-Dist: mypy; extra == "typecheck"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# Data Flow
|
|
23
|
+
|
|
24
|
+
vcti-flow bindings for array data: a **DataNode** payload binding and a
|
|
25
|
+
**FieldSet** table binding — sources, transformers, reducers, and sinks.
|
|
26
|
+
|
|
27
|
+
## Overview
|
|
28
|
+
|
|
29
|
+
[vcti-flow](https://github.com/vcollab/vcti-python-flow) is a payload-agnostic
|
|
30
|
+
framework for composing flow graphs — it never inspects the values flowing
|
|
31
|
+
through it. This package binds that framework to two concrete payloads so you
|
|
32
|
+
subclass ready-bound node kinds instead of writing `Source[...]` everywhere:
|
|
33
|
+
|
|
34
|
+
- **`vcti.flow.data`** — the [vcti-datanode](https://github.com/vcollab/vcti-python-datanode)
|
|
35
|
+
`DataNode` binding: one array plus layered attributes behind a data source.
|
|
36
|
+
Node kinds `Source` / `Transformer` / `Reducer` / `Sink` / `Observer`, plus
|
|
37
|
+
`from_array` (eager) and `ArraySource` (lazy) payload builders.
|
|
38
|
+
- **`vcti.flow.fieldset`** — the [vcti-fieldset](https://github.com/vcollab/vcti-python-fieldset)
|
|
39
|
+
`FieldSet` binding: a **table** of named columns. This is where column and row
|
|
40
|
+
reshaping lives — select / drop / rename / cast / compute, filter / sort /
|
|
41
|
+
slice, and the N→1 combiners merge / concat. The nodes are thin wrappers over
|
|
42
|
+
FieldSet's own copy-free `project()` / `rows()` builders.
|
|
43
|
+
|
|
44
|
+
Use the DataNode binding for single-array flows; use the FieldSet binding to
|
|
45
|
+
reshape multi-column tables. `DataNodesSource` / `WriteDataNodes` bridge between
|
|
46
|
+
them (a group of DataNodes ⇆ a FieldSet).
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install vcti-dataflow
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### In `pyproject.toml` dependencies
|
|
55
|
+
|
|
56
|
+
```toml
|
|
57
|
+
dependencies = [
|
|
58
|
+
"vcti-dataflow>=3.0.0",
|
|
59
|
+
]
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Quick Start — DataNode binding
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import numpy as np
|
|
68
|
+
from vcti.flow.data import Source, Transformer, DataNode, from_array
|
|
69
|
+
|
|
70
|
+
# A source produces a DataNode
|
|
71
|
+
class Stress(Source):
|
|
72
|
+
def load(self) -> DataNode:
|
|
73
|
+
return from_array(np.array([1.0, 2.0, 3.0]), {"units": "MPa"})
|
|
74
|
+
|
|
75
|
+
# A transformer maps one DataNode to another
|
|
76
|
+
class Scale(Transformer):
|
|
77
|
+
def __init__(self, factor: float) -> None:
|
|
78
|
+
super().__init__()
|
|
79
|
+
self.factor = factor
|
|
80
|
+
|
|
81
|
+
def transform(self, record: DataNode) -> DataNode:
|
|
82
|
+
return from_array(record.load() * self.factor, record.attributes)
|
|
83
|
+
|
|
84
|
+
result = Scale(2.0).connect(Stress()).execute()
|
|
85
|
+
result.load() # array([2., 4., 6.])
|
|
86
|
+
result.attributes["units"] # "MPa"
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Reach for the array only when you need it (`record.load()`); a leaf source can
|
|
90
|
+
hand back a `LazyDataSource`-backed node (or use `ArraySource`) to defer a heavy
|
|
91
|
+
read.
|
|
92
|
+
|
|
93
|
+
## Quick Start — FieldSet binding (tables)
|
|
94
|
+
|
|
95
|
+
The `vcti.flow.fieldset` nodes carry a `FieldSet` (named columns) and reshape it.
|
|
96
|
+
Column ops (`select` / `drop` / `rename` / `cast`) share columns by reference —
|
|
97
|
+
no data copy; `cast` is a lazy per-column cast. Row ops fold to a single index.
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
import numpy as np
|
|
101
|
+
from vcti.datanode import DataNode, EagerDataSource
|
|
102
|
+
from vcti.flow.fieldset import (
|
|
103
|
+
DataNodesSource, SelectFields, RenameFields, CastFields, ComputeFields,
|
|
104
|
+
FilterRows, SortRows, MergeFields, ConcatRows, WriteDataNodes,
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
# Read a group of DataNodes as one FieldSet (lazy — nothing materialises here)
|
|
108
|
+
src = DataNodesSource([
|
|
109
|
+
DataNode(name="id", data_source=EagerDataSource(np.array([1, 2, 3]))),
|
|
110
|
+
DataNode(name="stress", data_source=EagerDataSource(np.array([10.0, 20.0, 30.0]))),
|
|
111
|
+
])
|
|
112
|
+
|
|
113
|
+
# Reshape: rename, add a computed column, filter and sort rows
|
|
114
|
+
flow = RenameFields({"stress": "s"}).connect(src)
|
|
115
|
+
flow = ComputeFields("s_kpa = s * 1000").connect(flow)
|
|
116
|
+
flow = FilterRows("s > 10").connect(flow)
|
|
117
|
+
flow = SortRows("s", descending=True).connect(flow)
|
|
118
|
+
|
|
119
|
+
result = flow.execute() # a FieldSet
|
|
120
|
+
result.get_values("s_kpa") # array([30000., 20000.])
|
|
121
|
+
|
|
122
|
+
# Combine tables — N→1 reducers own the policy (collision / schema)
|
|
123
|
+
wide = MergeFields().connect(flowA).connect(flowB) # union columns
|
|
124
|
+
tall = ConcatRows().connect(flowA).connect(flowB) # stack rows
|
|
125
|
+
|
|
126
|
+
# Write out: FieldSet -> DataNodes -> your writer (I/O stays in your app)
|
|
127
|
+
WriteDataNodes(lambda node: write_to_h5(node)).connect(flow).execute()
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Column reshaping is `project()`-backed; row reshaping is `rows()`-backed. See
|
|
131
|
+
[docs/patterns.md](docs/patterns.md) for the full recipe set.
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## API
|
|
136
|
+
|
|
137
|
+
### `vcti.flow.data` — DataNode binding
|
|
138
|
+
|
|
139
|
+
| Symbol | Purpose |
|
|
140
|
+
|--------|---------|
|
|
141
|
+
| `Source` / `Transformer` / `Reducer` / `Sink` / `Observer` | `vcti.flow` kinds bound to `DataNode` |
|
|
142
|
+
| `from_array(array, attributes=None)` | Build a `DataNode` from an in-memory array (eager) |
|
|
143
|
+
| `ArraySource(load_fn, attributes=None)` | Lazy leaf source over a callable returning an array |
|
|
144
|
+
| `DataNode` / `EagerDataSource` / `LazyDataSource` | Re-exported from `vcti-datanode` |
|
|
145
|
+
|
|
146
|
+
### `vcti.flow.fieldset` — FieldSet binding
|
|
147
|
+
|
|
148
|
+
| Symbol | Purpose |
|
|
149
|
+
|--------|---------|
|
|
150
|
+
| `Source` / `Transformer` / `Reducer` / `Sink` / `Observer` | `vcti.flow` kinds bound to `FieldSet` |
|
|
151
|
+
| `SelectFields` / `DropFields` / `RenameFields` | Keep / remove / rename columns (share sources, no copy) |
|
|
152
|
+
| `CastFields` / `ComputeFields` / `FreezeFields` | Cast dtypes (lazy), add expression columns, bake an expression to data |
|
|
153
|
+
| `FilterRows` / `SortRows` / `HeadRows` / `SliceRows` | Row selection (fold to one index; slice stays a view) |
|
|
154
|
+
| `MergeFields` / `ConcatRows` | N→1: union columns / stack rows (own collision & schema policy) |
|
|
155
|
+
| `FieldSetSource` | Leaf source over an existing `FieldSet` (or a callable returning one) |
|
|
156
|
+
| `DataNodesSource` / `WriteDataNodes` | Boundary: DataNodes → FieldSet / FieldSet → DataNodes |
|
|
157
|
+
| `for_each_group(source, key_field, factory)` | Fan out into one flow per group |
|
|
158
|
+
| `from_datanodes` / `to_datanodes` | Re-exported from `vcti-fieldset` |
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Dependencies
|
|
163
|
+
|
|
164
|
+
- [vcti-flow](https://github.com/vcollab/vcti-python-flow) (>=2.0.0) — the generic framework
|
|
165
|
+
- [vcti-datanode](https://github.com/vcollab/vcti-python-datanode) (>=2.0.0) — the `DataNode` payload
|
|
166
|
+
- [vcti-fieldset](https://github.com/vcollab/vcti-python-fieldset) `[datanode]` (>=2.0.0) — the `FieldSet` table payload and its copy-free reshaping
|
|
167
|
+
- [numpy](https://numpy.org/) (>=1.24)
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# Data Flow
|
|
2
|
+
|
|
3
|
+
vcti-flow bindings for array data: a **DataNode** payload binding and a
|
|
4
|
+
**FieldSet** table binding — sources, transformers, reducers, and sinks.
|
|
5
|
+
|
|
6
|
+
## Overview
|
|
7
|
+
|
|
8
|
+
[vcti-flow](https://github.com/vcollab/vcti-python-flow) is a payload-agnostic
|
|
9
|
+
framework for composing flow graphs — it never inspects the values flowing
|
|
10
|
+
through it. This package binds that framework to two concrete payloads so you
|
|
11
|
+
subclass ready-bound node kinds instead of writing `Source[...]` everywhere:
|
|
12
|
+
|
|
13
|
+
- **`vcti.flow.data`** — the [vcti-datanode](https://github.com/vcollab/vcti-python-datanode)
|
|
14
|
+
`DataNode` binding: one array plus layered attributes behind a data source.
|
|
15
|
+
Node kinds `Source` / `Transformer` / `Reducer` / `Sink` / `Observer`, plus
|
|
16
|
+
`from_array` (eager) and `ArraySource` (lazy) payload builders.
|
|
17
|
+
- **`vcti.flow.fieldset`** — the [vcti-fieldset](https://github.com/vcollab/vcti-python-fieldset)
|
|
18
|
+
`FieldSet` binding: a **table** of named columns. This is where column and row
|
|
19
|
+
reshaping lives — select / drop / rename / cast / compute, filter / sort /
|
|
20
|
+
slice, and the N→1 combiners merge / concat. The nodes are thin wrappers over
|
|
21
|
+
FieldSet's own copy-free `project()` / `rows()` builders.
|
|
22
|
+
|
|
23
|
+
Use the DataNode binding for single-array flows; use the FieldSet binding to
|
|
24
|
+
reshape multi-column tables. `DataNodesSource` / `WriteDataNodes` bridge between
|
|
25
|
+
them (a group of DataNodes ⇆ a FieldSet).
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install vcti-dataflow
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### In `pyproject.toml` dependencies
|
|
34
|
+
|
|
35
|
+
```toml
|
|
36
|
+
dependencies = [
|
|
37
|
+
"vcti-dataflow>=3.0.0",
|
|
38
|
+
]
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Quick Start — DataNode binding
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import numpy as np
|
|
47
|
+
from vcti.flow.data import Source, Transformer, DataNode, from_array
|
|
48
|
+
|
|
49
|
+
# A source produces a DataNode
|
|
50
|
+
class Stress(Source):
|
|
51
|
+
def load(self) -> DataNode:
|
|
52
|
+
return from_array(np.array([1.0, 2.0, 3.0]), {"units": "MPa"})
|
|
53
|
+
|
|
54
|
+
# A transformer maps one DataNode to another
|
|
55
|
+
class Scale(Transformer):
|
|
56
|
+
def __init__(self, factor: float) -> None:
|
|
57
|
+
super().__init__()
|
|
58
|
+
self.factor = factor
|
|
59
|
+
|
|
60
|
+
def transform(self, record: DataNode) -> DataNode:
|
|
61
|
+
return from_array(record.load() * self.factor, record.attributes)
|
|
62
|
+
|
|
63
|
+
result = Scale(2.0).connect(Stress()).execute()
|
|
64
|
+
result.load() # array([2., 4., 6.])
|
|
65
|
+
result.attributes["units"] # "MPa"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Reach for the array only when you need it (`record.load()`); a leaf source can
|
|
69
|
+
hand back a `LazyDataSource`-backed node (or use `ArraySource`) to defer a heavy
|
|
70
|
+
read.
|
|
71
|
+
|
|
72
|
+
## Quick Start — FieldSet binding (tables)
|
|
73
|
+
|
|
74
|
+
The `vcti.flow.fieldset` nodes carry a `FieldSet` (named columns) and reshape it.
|
|
75
|
+
Column ops (`select` / `drop` / `rename` / `cast`) share columns by reference —
|
|
76
|
+
no data copy; `cast` is a lazy per-column cast. Row ops fold to a single index.
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
import numpy as np
|
|
80
|
+
from vcti.datanode import DataNode, EagerDataSource
|
|
81
|
+
from vcti.flow.fieldset import (
|
|
82
|
+
DataNodesSource, SelectFields, RenameFields, CastFields, ComputeFields,
|
|
83
|
+
FilterRows, SortRows, MergeFields, ConcatRows, WriteDataNodes,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# Read a group of DataNodes as one FieldSet (lazy — nothing materialises here)
|
|
87
|
+
src = DataNodesSource([
|
|
88
|
+
DataNode(name="id", data_source=EagerDataSource(np.array([1, 2, 3]))),
|
|
89
|
+
DataNode(name="stress", data_source=EagerDataSource(np.array([10.0, 20.0, 30.0]))),
|
|
90
|
+
])
|
|
91
|
+
|
|
92
|
+
# Reshape: rename, add a computed column, filter and sort rows
|
|
93
|
+
flow = RenameFields({"stress": "s"}).connect(src)
|
|
94
|
+
flow = ComputeFields("s_kpa = s * 1000").connect(flow)
|
|
95
|
+
flow = FilterRows("s > 10").connect(flow)
|
|
96
|
+
flow = SortRows("s", descending=True).connect(flow)
|
|
97
|
+
|
|
98
|
+
result = flow.execute() # a FieldSet
|
|
99
|
+
result.get_values("s_kpa") # array([30000., 20000.])
|
|
100
|
+
|
|
101
|
+
# Combine tables — N→1 reducers own the policy (collision / schema)
|
|
102
|
+
wide = MergeFields().connect(flowA).connect(flowB) # union columns
|
|
103
|
+
tall = ConcatRows().connect(flowA).connect(flowB) # stack rows
|
|
104
|
+
|
|
105
|
+
# Write out: FieldSet -> DataNodes -> your writer (I/O stays in your app)
|
|
106
|
+
WriteDataNodes(lambda node: write_to_h5(node)).connect(flow).execute()
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Column reshaping is `project()`-backed; row reshaping is `rows()`-backed. See
|
|
110
|
+
[docs/patterns.md](docs/patterns.md) for the full recipe set.
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## API
|
|
115
|
+
|
|
116
|
+
### `vcti.flow.data` — DataNode binding
|
|
117
|
+
|
|
118
|
+
| Symbol | Purpose |
|
|
119
|
+
|--------|---------|
|
|
120
|
+
| `Source` / `Transformer` / `Reducer` / `Sink` / `Observer` | `vcti.flow` kinds bound to `DataNode` |
|
|
121
|
+
| `from_array(array, attributes=None)` | Build a `DataNode` from an in-memory array (eager) |
|
|
122
|
+
| `ArraySource(load_fn, attributes=None)` | Lazy leaf source over a callable returning an array |
|
|
123
|
+
| `DataNode` / `EagerDataSource` / `LazyDataSource` | Re-exported from `vcti-datanode` |
|
|
124
|
+
|
|
125
|
+
### `vcti.flow.fieldset` — FieldSet binding
|
|
126
|
+
|
|
127
|
+
| Symbol | Purpose |
|
|
128
|
+
|--------|---------|
|
|
129
|
+
| `Source` / `Transformer` / `Reducer` / `Sink` / `Observer` | `vcti.flow` kinds bound to `FieldSet` |
|
|
130
|
+
| `SelectFields` / `DropFields` / `RenameFields` | Keep / remove / rename columns (share sources, no copy) |
|
|
131
|
+
| `CastFields` / `ComputeFields` / `FreezeFields` | Cast dtypes (lazy), add expression columns, bake an expression to data |
|
|
132
|
+
| `FilterRows` / `SortRows` / `HeadRows` / `SliceRows` | Row selection (fold to one index; slice stays a view) |
|
|
133
|
+
| `MergeFields` / `ConcatRows` | N→1: union columns / stack rows (own collision & schema policy) |
|
|
134
|
+
| `FieldSetSource` | Leaf source over an existing `FieldSet` (or a callable returning one) |
|
|
135
|
+
| `DataNodesSource` / `WriteDataNodes` | Boundary: DataNodes → FieldSet / FieldSet → DataNodes |
|
|
136
|
+
| `for_each_group(source, key_field, factory)` | Fan out into one flow per group |
|
|
137
|
+
| `from_datanodes` / `to_datanodes` | Re-exported from `vcti-fieldset` |
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Dependencies
|
|
142
|
+
|
|
143
|
+
- [vcti-flow](https://github.com/vcollab/vcti-python-flow) (>=2.0.0) — the generic framework
|
|
144
|
+
- [vcti-datanode](https://github.com/vcollab/vcti-python-datanode) (>=2.0.0) — the `DataNode` payload
|
|
145
|
+
- [vcti-fieldset](https://github.com/vcollab/vcti-python-fieldset) `[datanode]` (>=2.0.0) — the `FieldSet` table payload and its copy-free reshaping
|
|
146
|
+
- [numpy](https://numpy.org/) (>=1.24)
|
|
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "vcti-dataflow"
|
|
7
|
-
version = "2.0.0"
|
|
8
|
-
description = "
|
|
7
|
+
version = "3.0.0"
|
|
8
|
+
description = "vcti-flow bindings for array data: a DataNode payload binding and a FieldSet table binding (sources, transformers, reducers, sinks)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [
|
|
11
11
|
{name = "Visual Collaboration Technologies Inc."}
|
|
@@ -14,16 +14,18 @@ requires-python = ">=3.12,<3.15"
|
|
|
14
14
|
dependencies = [
|
|
15
15
|
"vcti-flow>=2.0.0",
|
|
16
16
|
"vcti-datanode>=2.0.0",
|
|
17
|
+
"vcti-fieldset[datanode]>=2.0.0",
|
|
17
18
|
"numpy>=1.24",
|
|
18
19
|
]
|
|
19
20
|
|
|
20
21
|
[tool.setuptools.packages.find]
|
|
21
22
|
where = ["src"]
|
|
22
|
-
include = ["vcti.flow.data", "vcti.flow.data.*"]
|
|
23
|
+
include = ["vcti.flow.data", "vcti.flow.data.*", "vcti.flow.fieldset", "vcti.flow.fieldset.*"]
|
|
23
24
|
namespaces = true
|
|
24
25
|
|
|
25
26
|
[tool.setuptools.package-data]
|
|
26
27
|
"vcti.flow.data" = ["py.typed"]
|
|
28
|
+
"vcti.flow.fieldset" = ["py.typed"]
|
|
27
29
|
|
|
28
30
|
[project.optional-dependencies]
|
|
29
31
|
test = ["pytest", "pytest-cov"]
|
|
@@ -34,7 +36,7 @@ typecheck = ["mypy"]
|
|
|
34
36
|
zip-safe = true
|
|
35
37
|
|
|
36
38
|
[tool.pytest.ini_options]
|
|
37
|
-
addopts = "--cov=vcti.flow.data --cov-report=term-missing --cov-fail-under=95"
|
|
39
|
+
addopts = "--cov=vcti.flow.data --cov=vcti.flow.fieldset --cov-report=term-missing --cov-fail-under=95"
|
|
38
40
|
|
|
39
41
|
[tool.mypy]
|
|
40
42
|
python_version = "3.12"
|
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
|
|
5
5
|
DataNode-bound node kinds (``Source``, ``Transformer``, ``Reducer``, ``Sink``,
|
|
6
6
|
``Observer``) over the generic ``vcti.flow`` framework, plus ``from_array`` (eager)
|
|
7
|
-
and ``ArraySource`` (lazy) for building ``DataNode`` payloads.
|
|
8
|
-
|
|
9
|
-
``vcti.flow.
|
|
7
|
+
and ``ArraySource`` (lazy) for building ``DataNode`` payloads. This binding is for
|
|
8
|
+
single-array flows; table (named-column) reshaping lives in the sibling
|
|
9
|
+
``vcti.flow.fieldset`` binding over a ``FieldSet`` payload.
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
12
|
from importlib.metadata import version
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""vcti.flow.fieldset — the FieldSet binding of vcti.flow.
|
|
4
|
+
|
|
5
|
+
FieldSet-bound node kinds (``Source``, ``Transformer``, ``Reducer``, ``Sink``,
|
|
6
|
+
``Observer``) over the generic ``vcti.flow`` framework, plus flow nodes that wrap
|
|
7
|
+
``vcti.fieldset.FieldSet``'s copy-free reshaping:
|
|
8
|
+
|
|
9
|
+
- Column transformers (1->1, over ``FieldSet.project()`` / ``expressions``):
|
|
10
|
+
``SelectFields``, ``DropFields``, ``RenameFields``, ``CastFields``,
|
|
11
|
+
``ComputeFields``, ``FreezeFields``.
|
|
12
|
+
- Row transformers (1->1, over ``FieldSet.rows()``): ``FilterRows``,
|
|
13
|
+
``SortRows``, ``HeadRows``, ``SliceRows``.
|
|
14
|
+
- Reducers (N->1): ``MergeFields`` (union columns), ``ConcatRows`` (stack rows).
|
|
15
|
+
- Sources: ``FieldSetSource`` (an existing FieldSet) and ``DataNodesSource``
|
|
16
|
+
(DataNodes -> FieldSet).
|
|
17
|
+
- Sink / boundary: ``WriteDataNodes`` (FieldSet -> DataNodes), plus re-exported
|
|
18
|
+
``from_datanodes`` / ``to_datanodes``.
|
|
19
|
+
- Group fan-out: ``for_each_group``.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from importlib.metadata import version
|
|
23
|
+
|
|
24
|
+
from vcti.fieldset import FieldSet
|
|
25
|
+
from vcti.fieldset.datanode import from_datanodes, to_datanodes
|
|
26
|
+
|
|
27
|
+
from .aliases import Observer, Reducer, Sink, Source, Transformer
|
|
28
|
+
from .iterate import for_each_group
|
|
29
|
+
from .reducers import ConcatRows, MergeFields
|
|
30
|
+
from .sinks import WriteDataNodes
|
|
31
|
+
from .sources import DataNodesSource, FieldSetSource
|
|
32
|
+
from .transforms import (
|
|
33
|
+
CastFields,
|
|
34
|
+
ComputeFields,
|
|
35
|
+
DropFields,
|
|
36
|
+
FilterRows,
|
|
37
|
+
FreezeFields,
|
|
38
|
+
HeadRows,
|
|
39
|
+
RenameFields,
|
|
40
|
+
SelectFields,
|
|
41
|
+
SliceRows,
|
|
42
|
+
SortRows,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
__version__ = version("vcti-dataflow")
|
|
46
|
+
|
|
47
|
+
__all__ = [
|
|
48
|
+
"__version__",
|
|
49
|
+
"CastFields",
|
|
50
|
+
"ComputeFields",
|
|
51
|
+
"ConcatRows",
|
|
52
|
+
"DataNodesSource",
|
|
53
|
+
"DropFields",
|
|
54
|
+
"FieldSet",
|
|
55
|
+
"FieldSetSource",
|
|
56
|
+
"FilterRows",
|
|
57
|
+
"FreezeFields",
|
|
58
|
+
"HeadRows",
|
|
59
|
+
"MergeFields",
|
|
60
|
+
"Observer",
|
|
61
|
+
"Reducer",
|
|
62
|
+
"RenameFields",
|
|
63
|
+
"SelectFields",
|
|
64
|
+
"Sink",
|
|
65
|
+
"SliceRows",
|
|
66
|
+
"SortRows",
|
|
67
|
+
"Source",
|
|
68
|
+
"Transformer",
|
|
69
|
+
"WriteDataNodes",
|
|
70
|
+
"for_each_group",
|
|
71
|
+
"from_datanodes",
|
|
72
|
+
"to_datanodes",
|
|
73
|
+
]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""FieldSet-bound aliases for the vcti.flow node kinds.
|
|
4
|
+
|
|
5
|
+
These bind the generic ``vcti.flow`` framework to the ``vcti.fieldset.FieldSet``
|
|
6
|
+
payload, so authors subclass ``Source`` / ``Transformer`` / … — the
|
|
7
|
+
``vcti.flow.fieldset`` spelling of ``Source[FieldSet]`` /
|
|
8
|
+
``Transformer[FieldSet, FieldSet]`` — instead of repeating the type parameter.
|
|
9
|
+
They are plain assignments (not PEP 695 ``type`` aliases) so they remain usable
|
|
10
|
+
as base classes.
|
|
11
|
+
|
|
12
|
+
The names intentionally shadow the generic ``vcti.flow.core`` kinds: within this
|
|
13
|
+
binding ``Transformer`` *is* ``Transformer[FieldSet, FieldSet]``. This is the
|
|
14
|
+
FieldSet counterpart of the ``DataNode`` binding in ``vcti.flow.data``.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import vcti.flow.core as core
|
|
20
|
+
from vcti.fieldset import FieldSet
|
|
21
|
+
|
|
22
|
+
Source = core.Source[FieldSet]
|
|
23
|
+
Transformer = core.Transformer[FieldSet, FieldSet]
|
|
24
|
+
Reducer = core.Reducer[FieldSet, FieldSet]
|
|
25
|
+
Sink = core.Sink[FieldSet]
|
|
26
|
+
Observer = core.Observer[FieldSet]
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""Group fan-out over a FieldSet, as a flow combinator."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from collections.abc import Callable, Iterator
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from vcti.fieldset import FieldSet
|
|
11
|
+
from vcti.flow.core import Node
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def for_each_group[U](
|
|
15
|
+
source: Node[FieldSet],
|
|
16
|
+
key_field: str,
|
|
17
|
+
factory: Callable[[FieldSet], Node[U]],
|
|
18
|
+
) -> Iterator[tuple[Any, Node[U]]]:
|
|
19
|
+
"""Fan a FieldSet out into one flow per group, keyed by *key_field*.
|
|
20
|
+
|
|
21
|
+
Executes *source* once, partitions its rows with ``rows().group_by(key_field)``
|
|
22
|
+
(each group is a row-view FieldSet sharing the base's columns), and yields
|
|
23
|
+
``(key, factory(group))`` per distinct key. The FieldSet-specific analogue of
|
|
24
|
+
``vcti.flow.core.for_each`` for the grouped case.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
source: A node producing the FieldSet to partition.
|
|
28
|
+
key_field: Field (or expression) name whose distinct values define groups.
|
|
29
|
+
factory: Builds a downstream node from each group's FieldSet.
|
|
30
|
+
|
|
31
|
+
Yields:
|
|
32
|
+
``(key_value, node)`` pairs, one per group, key order ascending.
|
|
33
|
+
"""
|
|
34
|
+
base = source.execute()
|
|
35
|
+
for key, group in base.rows().group_by(key_field):
|
|
36
|
+
yield key, factory(group)
|
|
File without changes
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""N->1 reducers over FieldSet payloads.
|
|
4
|
+
|
|
5
|
+
These combine *multiple* FieldSets into one — the arity FieldSet itself does not
|
|
6
|
+
cover (its ``project`` / ``rows`` are 1->1). ``MergeFields`` unions columns
|
|
7
|
+
(horizontal); ``ConcatRows`` stacks rows (vertical). Combination policy —
|
|
8
|
+
name-collision handling, schema matching — lives here, in the pipeline, not in
|
|
9
|
+
FieldSet.
|
|
10
|
+
|
|
11
|
+
Only stored fields are combined; registered **expressions are not carried** (they
|
|
12
|
+
are per-input and may reference dropped columns). Freeze an expression
|
|
13
|
+
(``FreezeFields``) before combining if you need it in the output. Per-field
|
|
14
|
+
properties (components, user attributes) of the combined fields are carried over.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
from vcti.fieldset import ArrayFieldSource, FieldSet
|
|
21
|
+
|
|
22
|
+
from .aliases import Reducer
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _carry_field_properties(src: FieldSet, dst: FieldSet, names: set[str]) -> None:
|
|
26
|
+
"""Copy per-field properties (components + user attributes) for ``names``.
|
|
27
|
+
|
|
28
|
+
Reaches into the property store via the same ``to_dict`` / ``key_to_parts`` /
|
|
29
|
+
``set`` path FieldSet's own projection uses. Keys missing an expected part are
|
|
30
|
+
skipped rather than raising, so a future store-schema change degrades to
|
|
31
|
+
"metadata not carried" instead of a hard error.
|
|
32
|
+
"""
|
|
33
|
+
store = src.properties.store
|
|
34
|
+
mapper = store.key_mapper
|
|
35
|
+
for key, value in store.to_dict().items():
|
|
36
|
+
_, parts = mapper.key_to_parts(key)
|
|
37
|
+
column = parts.get("column")
|
|
38
|
+
scope = parts.get("scope")
|
|
39
|
+
name = parts.get("name")
|
|
40
|
+
if column in names and scope is not None and name is not None:
|
|
41
|
+
dst.properties.store.set(scope=scope, column=column, name=name, value=value)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class MergeFields(Reducer):
    """Combine the columns of several FieldSets into a single FieldSet.

    Each input's field sources are added to the output as-is (the source
    objects are handed over by reference, not copied). Row-count agreement is
    not checked here; it is left to ``FieldSet``, which the package documents
    as enforcing it.

    ``on_conflict`` selects what happens when an input offers a field name the
    output already holds:

    - ``"error"`` (default) — raise ``ValueError``.
    - ``"first"`` — keep the field already present and skip the newcomer.
    - ``"last"`` — drop the field already present and add the newcomer.

    Args:
        on_conflict: One of ``"error"``, ``"first"``, ``"last"``.
        name: Optional node name, forwarded to the base class.

    Raises:
        ValueError: If ``on_conflict`` is not one of the three accepted values.
    """

    def __init__(self, *, on_conflict: str = "error", name: str | None = None) -> None:
        super().__init__(name=name)
        is_known_policy = on_conflict in ("error", "first", "last")
        if not is_known_policy:
            raise ValueError(
                f"on_conflict must be 'error', 'first', or 'last', got {on_conflict!r}"
            )
        self._on_conflict = on_conflict

    def reduce(self, records: list[FieldSet]) -> FieldSet:
        """Merge *records* column-wise and carry over per-field properties.

        Raises:
            ValueError: On a duplicate field name when the policy is ``"error"``.
        """
        merged = FieldSet()
        policy = self._on_conflict
        # Which input ended up supplying each output column.
        contributor: dict[str, FieldSet] = {}
        for table in records:
            for column in table.fields.names:
                if column in merged.fields:
                    if policy == "error":
                        raise ValueError(f"MergeFields: duplicate field name {column!r}")
                    if policy == "first":
                        continue
                    # policy == "last": evict the earlier column before re-adding.
                    merged.fields.remove(column)
                merged.fields.add(column, table.fields.source(column))
                contributor[column] = table

        # Bucket the columns by contributing input (keyed on object identity)
        # so that properties are carried with one pass per input rather than
        # one pass per column.
        per_source: dict[int, tuple[FieldSet, set[str]]] = {}
        for column, table in contributor.items():
            slot = per_source.get(id(table))
            if slot is None:
                slot = per_source[id(table)] = (table, set())
            slot[1].add(column)
        for table, columns in per_source.values():
            _carry_field_properties(table, merged, columns)
        return merged
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class ConcatRows(Reducer):
    """Append the rows of several FieldSets that share one stored-field schema.

    For every stored field, the values of all inputs are joined with
    ``numpy.concatenate`` and stored in the output as an ``ArrayFieldSource``.
    Schema equality is judged on the *set* of stored field names, so field
    order may differ between inputs; the output follows the first input's
    order, and per-field properties are taken from the first input.

    An empty ``records`` list yields an empty FieldSet. Inputs without stored
    fields only match other field-less inputs (the result is then an empty
    FieldSet); mixing a field-less input with data-bearing ones is reported as
    a schema mismatch. Only stored fields are read here — expressions are not
    concatenated (freeze them beforehand, e.g. with ``FreezeFields``).

    Raises:
        ValueError: If any input's stored field names differ from the first's.
    """

    def reduce(self, records: list[FieldSet]) -> FieldSet:
        """Stack *records* row-wise into a new FieldSet."""
        if not records:
            return FieldSet()

        head = records[0]
        names = head.fields.names
        schema = set(names)
        for other in records[1:]:
            if set(other.fields.names) != schema:
                raise ValueError(f"ConcatRows: inconsistent schema {other.fields.names} != {names}")

        stacked_set = FieldSet()
        if names:
            for column in names:
                pieces = [table.get_values(column) for table in records]
                stacked_set.fields.add(column, ArrayFieldSource(np.concatenate(pieces)))
            _carry_field_properties(head, stacked_set, schema)
        return stacked_set
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""Sinks that export a FieldSet payload."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from collections.abc import Callable
|
|
8
|
+
|
|
9
|
+
from vcti.datanode import DataNode
|
|
10
|
+
from vcti.fieldset import FieldSet
|
|
11
|
+
from vcti.fieldset.datanode import to_datanodes
|
|
12
|
+
|
|
13
|
+
from .aliases import Sink
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class WriteDataNodes(Sink):
    """Sink that converts a FieldSet to DataNodes and feeds each to a callback.

    The conversion is delegated to ``vcti.fieldset.datanode.to_datanodes``
    (called with the optional ``fields`` subset); every node it produces is
    passed, in order, to *writer*. No I/O is performed by this class itself —
    whatever *writer* does with a node is up to the caller. Per the package's
    description of ``to_datanodes``, fields (and, by default, expressions)
    become eager ``DataNode`` objects carrying their components and user
    attributes. As a ``Sink``, the node is documented to hand the incoming
    FieldSet on unchanged.

    Args:
        writer: Callback invoked once for each exported node,
            ``writer(node) -> None``.
        fields: Names of the fields/expressions to export; ``None`` (default)
            is forwarded as-is to ``to_datanodes``, meaning no subset.
        name: Optional node name, forwarded to the base class.
    """

    def __init__(
        self,
        writer: Callable[[DataNode], None],
        *,
        fields: list[str] | None = None,
        name: str | None = None,
    ) -> None:
        super().__init__(name=name)
        self._fields = fields
        self._writer = writer

    def save(self, record: FieldSet) -> None:
        """Export *record* as DataNodes and pass each one to the writer."""
        exported = to_datanodes(record, fields=self._fields)
        for node in exported:
            self._writer(node)
|