vcti-dataflow 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vcti_dataflow-2.0.0/LICENSE +8 -0
- vcti_dataflow-2.0.0/PKG-INFO +147 -0
- vcti_dataflow-2.0.0/README.md +127 -0
- vcti_dataflow-2.0.0/pyproject.toml +64 -0
- vcti_dataflow-2.0.0/setup.cfg +4 -0
- vcti_dataflow-2.0.0/src/vcti/flow/data/__init__.py +40 -0
- vcti_dataflow-2.0.0/src/vcti/flow/data/aliases.py +27 -0
- vcti_dataflow-2.0.0/src/vcti/flow/data/fields/__init__.py +41 -0
- vcti_dataflow-2.0.0/src/vcti/flow/data/fields/iterate.py +48 -0
- vcti_dataflow-2.0.0/src/vcti/flow/data/fields/merge.py +53 -0
- vcti_dataflow-2.0.0/src/vcti/flow/data/fields/sources.py +94 -0
- vcti_dataflow-2.0.0/src/vcti/flow/data/fields/transforms.py +203 -0
- vcti_dataflow-2.0.0/src/vcti/flow/data/py.typed +0 -0
- vcti_dataflow-2.0.0/src/vcti/flow/data/record.py +34 -0
- vcti_dataflow-2.0.0/src/vcti/flow/data/sources.py +40 -0
- vcti_dataflow-2.0.0/src/vcti_dataflow.egg-info/PKG-INFO +147 -0
- vcti_dataflow-2.0.0/src/vcti_dataflow.egg-info/SOURCES.txt +23 -0
- vcti_dataflow-2.0.0/src/vcti_dataflow.egg-info/dependency_links.txt +1 -0
- vcti_dataflow-2.0.0/src/vcti_dataflow.egg-info/requires.txt +13 -0
- vcti_dataflow-2.0.0/src/vcti_dataflow.egg-info/top_level.txt +1 -0
- vcti_dataflow-2.0.0/src/vcti_dataflow.egg-info/zip-safe +1 -0
- vcti_dataflow-2.0.0/tests/test_data.py +81 -0
- vcti_dataflow-2.0.0/tests/test_fields.py +95 -0
- vcti_dataflow-2.0.0/tests/test_shaping.py +309 -0
- vcti_dataflow-2.0.0/tests/test_version.py +15 -0
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
Copyright (c) 2018-2026 Visual Collaboration Technologies Inc.
|
|
2
|
+
All Rights Reserved.
|
|
3
|
+
|
|
4
|
+
This software is proprietary and confidential. Unauthorized copying,
|
|
5
|
+
distribution, or use of this software, via any medium, is strictly
|
|
6
|
+
prohibited. Access is granted only to authorized VCollab developers
|
|
7
|
+
and individuals explicitly authorized by Visual Collaboration
|
|
8
|
+
Technologies Inc.
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vcti-dataflow
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: The DataNode binding of vcti-flow: sources, transformers, reducers, and combiners for vcti-datanode payloads.
|
|
5
|
+
Author: Visual Collaboration Technologies Inc.
|
|
6
|
+
Requires-Python: <3.15,>=3.12
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: vcti-flow>=2.0.0
|
|
10
|
+
Requires-Dist: vcti-datanode>=2.0.0
|
|
11
|
+
Requires-Dist: numpy>=1.24
|
|
12
|
+
Provides-Extra: test
|
|
13
|
+
Requires-Dist: pytest; extra == "test"
|
|
14
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
15
|
+
Provides-Extra: lint
|
|
16
|
+
Requires-Dist: ruff; extra == "lint"
|
|
17
|
+
Provides-Extra: typecheck
|
|
18
|
+
Requires-Dist: mypy; extra == "typecheck"
|
|
19
|
+
Dynamic: license-file
|
|
20
|
+
|
|
21
|
+
# Data Flow
|
|
22
|
+
|
|
23
|
+
The DataNode binding of vcti-flow: sources, transformers, reducers, and combiners for vcti-datanode payloads.
|
|
24
|
+
|
|
25
|
+
## Overview
|
|
26
|
+
|
|
27
|
+
[vcti-flow](https://github.com/vcollab/vcti-python-flow) is a payload-agnostic
|
|
28
|
+
framework for composing flow graphs — it never inspects the values flowing
|
|
29
|
+
through it. `vcti.flow.data` binds that framework to the
|
|
30
|
+
[vcti-datanode](https://github.com/vcollab/vcti-python-datanode) `DataNode`
|
|
31
|
+
payload (data plus layered attributes behind a data source), so you get
|
|
32
|
+
familiar, ready-bound node kinds — `Source`, `Transformer`, `Reducer`, `Sink` —
|
|
33
|
+
instead of writing `Source[DataNode]` everywhere, plus `from_array` (eager) and
|
|
34
|
+
`ArraySource` (lazy) for building payloads.
|
|
35
|
+
|
|
36
|
+
These node-kind names are the `vcti.flow` kinds bound to `DataNode`: in this
|
|
37
|
+
package `Source` *is* `Source[DataNode]`. It is a flow leaf node — distinct from
|
|
38
|
+
`vcti.datanode.DataSource`, the array-source ABC behind the re-exported
|
|
39
|
+
`EagerDataSource` / `LazyDataSource`.
|
|
40
|
+
|
|
41
|
+
Nodes that need a *structured* (named-field) array — field shaping, field-wise merge, and
|
|
42
|
+
row-keyed iteration — live in the `vcti.flow.data.fields` submodule; the base
|
|
43
|
+
binding makes no assumption about array shape.
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install vcti-dataflow
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### In `pyproject.toml` dependencies
|
|
52
|
+
|
|
53
|
+
```toml
|
|
54
|
+
dependencies = [
|
|
55
|
+
"vcti-dataflow>=2.0.0",
|
|
56
|
+
]
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Quick Start
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import numpy as np
|
|
65
|
+
from vcti.flow.data import Source, Transformer, DataNode, from_array
|
|
66
|
+
|
|
67
|
+
# A source produces a DataNode
|
|
68
|
+
class Stress(Source):
|
|
69
|
+
def load(self) -> DataNode:
|
|
70
|
+
return from_array(np.array([1.0, 2.0, 3.0]), {"units": "MPa"})
|
|
71
|
+
|
|
72
|
+
# A transformer maps one DataNode to another
|
|
73
|
+
class Scale(Transformer):
|
|
74
|
+
def __init__(self, factor: float) -> None:
|
|
75
|
+
super().__init__()
|
|
76
|
+
self.factor = factor
|
|
77
|
+
|
|
78
|
+
def transform(self, record: DataNode) -> DataNode:
|
|
79
|
+
return from_array(record.load() * self.factor, record.attributes)
|
|
80
|
+
|
|
81
|
+
result = Scale(2.0).connect(Stress()).execute()
|
|
82
|
+
result.load() # array([2., 4., 6.])
|
|
83
|
+
result.attributes["units"] # "MPa"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Reach for the array only when you need it (`record.load()`); a leaf source can
|
|
87
|
+
hand back a `LazyDataSource`-backed node to defer a heavy read.
|
|
88
|
+
|
|
89
|
+
### Structured-array nodes
|
|
90
|
+
|
|
91
|
+
The `vcti.flow.data.fields` submodule adds nodes that assume a structured
|
|
92
|
+
(named-field) array — building tables, naming/selecting/computing fields,
|
|
93
|
+
merging field groups, and row-keyed iteration:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from vcti.flow.data import ArraySource # lazy leaf source (base binding)
|
|
97
|
+
from vcti.flow.data.fields import (
|
|
98
|
+
RowTableSource, NameFields, SelectFields, ComputeFields,
|
|
99
|
+
RenameFields, DropFields, CastFields, MergeFields, for_each_field,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
# Build a structured table from dict rows (lazy — rows read on load())
|
|
103
|
+
mats = RowTableSource(lambda: material_rows(reader),
|
|
104
|
+
columns={"id": "MAT_ID", "EX": "Young's Modulus"})
|
|
105
|
+
|
|
106
|
+
# Name plain columns, select/rename, compute
|
|
107
|
+
coords = NameFields(["X", "Y", "Z"]).connect(ArraySource(lambda: reader.coords()))
|
|
108
|
+
picked = SelectFields({"X": "x", "Y": "y"}).connect(coords)
|
|
109
|
+
mag = ComputeFields({"mag": lambda a: np.hypot(a["X"], a["Y"])}).connect(coords)
|
|
110
|
+
|
|
111
|
+
# Rename, drop, cast (rename & drop return views — no data copy)
|
|
112
|
+
renamed = RenameFields({"X": "x"}).connect(coords) # rename, keep the rest
|
|
113
|
+
trimmed = DropFields(["Z"]).connect(coords) # drop, keep the rest
|
|
114
|
+
narrow = CastFields({"X": "f4"}).connect(coords) # change dtypes
|
|
115
|
+
|
|
116
|
+
# Merge field groups (same row count) into one structured array
|
|
117
|
+
combined = MergeFields().connect(ids).connect(coords).execute()
|
|
118
|
+
|
|
119
|
+
# One flow per row, keyed by a field
|
|
120
|
+
for case_id, flow in for_each_field(cases, build_case_flow, key_field="ID"):
|
|
121
|
+
flow.execute()
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## API
|
|
127
|
+
|
|
128
|
+
| Symbol | Purpose |
|
|
129
|
+
|--------|---------|
|
|
130
|
+
| `Source` / `Transformer` / `Reducer` / `Sink` | `vcti.flow` node kinds bound to `DataNode` |
|
|
131
|
+
| `Observer` | `vcti.flow` observer bound to `DataNode` |
|
|
132
|
+
| `from_array(array, attributes=None)` | Build a `DataNode` from an in-memory array (eager) |
|
|
133
|
+
| `ArraySource(load_fn, attributes=None)` | Lazy leaf source over a callable returning an array (the lazy counterpart of `from_array`) |
|
|
134
|
+
| `DataNode` / `EagerDataSource` / `LazyDataSource` | Re-exported from `vcti-datanode` for convenience |
|
|
135
|
+
| `fields.RowTableSource` | Lazy leaf source — a structured table from dict rows |
|
|
136
|
+
| `fields.NameFields` / `fields.SelectFields` / `fields.ComputeFields` | Name plain columns, select/rename fields, append/replace computed fields |
|
|
137
|
+
| `fields.RenameFields` / `fields.DropFields` / `fields.CastFields` | Rename or drop fields (views, no copy), or change field dtypes |
|
|
138
|
+
| `fields.MergeFields` | Field-wise merge of structured arrays |
|
|
139
|
+
| `fields.for_each_field` / `fields.field_items` | Row-keyed fan-out over a structured array |
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Dependencies
|
|
144
|
+
|
|
145
|
+
- [vcti-flow](https://github.com/vcollab/vcti-python-flow) (>=2.0.0) — the generic framework
|
|
146
|
+
- [vcti-datanode](https://github.com/vcollab/vcti-python-datanode) (>=2.0.0) — the `DataNode` payload
|
|
147
|
+
- [numpy](https://numpy.org/) (>=1.24)
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# Data Flow
|
|
2
|
+
|
|
3
|
+
The DataNode binding of vcti-flow: sources, transformers, reducers, and combiners for vcti-datanode payloads.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
[vcti-flow](https://github.com/vcollab/vcti-python-flow) is a payload-agnostic
|
|
8
|
+
framework for composing flow graphs — it never inspects the values flowing
|
|
9
|
+
through it. `vcti.flow.data` binds that framework to the
|
|
10
|
+
[vcti-datanode](https://github.com/vcollab/vcti-python-datanode) `DataNode`
|
|
11
|
+
payload (data plus layered attributes behind a data source), so you get
|
|
12
|
+
familiar, ready-bound node kinds — `Source`, `Transformer`, `Reducer`, `Sink` —
|
|
13
|
+
instead of writing `Source[DataNode]` everywhere, plus `from_array` (eager) and
|
|
14
|
+
`ArraySource` (lazy) for building payloads.
|
|
15
|
+
|
|
16
|
+
These node-kind names are the `vcti.flow` kinds bound to `DataNode`: in this
|
|
17
|
+
package `Source` *is* `Source[DataNode]`. It is a flow leaf node — distinct from
|
|
18
|
+
`vcti.datanode.DataSource`, the array-source ABC behind the re-exported
|
|
19
|
+
`EagerDataSource` / `LazyDataSource`.
|
|
20
|
+
|
|
21
|
+
Nodes that need a *structured* (named-field) array — field shaping, field-wise merge, and
|
|
22
|
+
row-keyed iteration — live in the `vcti.flow.data.fields` submodule; the base
|
|
23
|
+
binding makes no assumption about array shape.
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install vcti-dataflow
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### In `pyproject.toml` dependencies
|
|
32
|
+
|
|
33
|
+
```toml
|
|
34
|
+
dependencies = [
|
|
35
|
+
"vcti-dataflow>=2.0.0",
|
|
36
|
+
]
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import numpy as np
|
|
45
|
+
from vcti.flow.data import Source, Transformer, DataNode, from_array
|
|
46
|
+
|
|
47
|
+
# A source produces a DataNode
|
|
48
|
+
class Stress(Source):
|
|
49
|
+
def load(self) -> DataNode:
|
|
50
|
+
return from_array(np.array([1.0, 2.0, 3.0]), {"units": "MPa"})
|
|
51
|
+
|
|
52
|
+
# A transformer maps one DataNode to another
|
|
53
|
+
class Scale(Transformer):
|
|
54
|
+
def __init__(self, factor: float) -> None:
|
|
55
|
+
super().__init__()
|
|
56
|
+
self.factor = factor
|
|
57
|
+
|
|
58
|
+
def transform(self, record: DataNode) -> DataNode:
|
|
59
|
+
return from_array(record.load() * self.factor, record.attributes)
|
|
60
|
+
|
|
61
|
+
result = Scale(2.0).connect(Stress()).execute()
|
|
62
|
+
result.load() # array([2., 4., 6.])
|
|
63
|
+
result.attributes["units"] # "MPa"
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Reach for the array only when you need it (`record.load()`); a leaf source can
|
|
67
|
+
hand back a `LazyDataSource`-backed node to defer a heavy read.
|
|
68
|
+
|
|
69
|
+
### Structured-array nodes
|
|
70
|
+
|
|
71
|
+
The `vcti.flow.data.fields` submodule adds nodes that assume a structured
|
|
72
|
+
(named-field) array — building tables, naming/selecting/computing fields,
|
|
73
|
+
merging field groups, and row-keyed iteration:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from vcti.flow.data import ArraySource # lazy leaf source (base binding)
|
|
77
|
+
from vcti.flow.data.fields import (
|
|
78
|
+
RowTableSource, NameFields, SelectFields, ComputeFields,
|
|
79
|
+
RenameFields, DropFields, CastFields, MergeFields, for_each_field,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# Build a structured table from dict rows (lazy — rows read on load())
|
|
83
|
+
mats = RowTableSource(lambda: material_rows(reader),
|
|
84
|
+
columns={"id": "MAT_ID", "EX": "Young's Modulus"})
|
|
85
|
+
|
|
86
|
+
# Name plain columns, select/rename, compute
|
|
87
|
+
coords = NameFields(["X", "Y", "Z"]).connect(ArraySource(lambda: reader.coords()))
|
|
88
|
+
picked = SelectFields({"X": "x", "Y": "y"}).connect(coords)
|
|
89
|
+
mag = ComputeFields({"mag": lambda a: np.hypot(a["X"], a["Y"])}).connect(coords)
|
|
90
|
+
|
|
91
|
+
# Rename, drop, cast (rename & drop return views — no data copy)
|
|
92
|
+
renamed = RenameFields({"X": "x"}).connect(coords) # rename, keep the rest
|
|
93
|
+
trimmed = DropFields(["Z"]).connect(coords) # drop, keep the rest
|
|
94
|
+
narrow = CastFields({"X": "f4"}).connect(coords) # change dtypes
|
|
95
|
+
|
|
96
|
+
# Merge field groups (same row count) into one structured array
|
|
97
|
+
combined = MergeFields().connect(ids).connect(coords).execute()
|
|
98
|
+
|
|
99
|
+
# One flow per row, keyed by a field
|
|
100
|
+
for case_id, flow in for_each_field(cases, build_case_flow, key_field="ID"):
|
|
101
|
+
flow.execute()
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## API
|
|
107
|
+
|
|
108
|
+
| Symbol | Purpose |
|
|
109
|
+
|--------|---------|
|
|
110
|
+
| `Source` / `Transformer` / `Reducer` / `Sink` | `vcti.flow` node kinds bound to `DataNode` |
|
|
111
|
+
| `Observer` | `vcti.flow` observer bound to `DataNode` |
|
|
112
|
+
| `from_array(array, attributes=None)` | Build a `DataNode` from an in-memory array (eager) |
|
|
113
|
+
| `ArraySource(load_fn, attributes=None)` | Lazy leaf source over a callable returning an array (the lazy counterpart of `from_array`) |
|
|
114
|
+
| `DataNode` / `EagerDataSource` / `LazyDataSource` | Re-exported from `vcti-datanode` for convenience |
|
|
115
|
+
| `fields.RowTableSource` | Lazy leaf source — a structured table from dict rows |
|
|
116
|
+
| `fields.NameFields` / `fields.SelectFields` / `fields.ComputeFields` | Name plain columns, select/rename fields, append/replace computed fields |
|
|
117
|
+
| `fields.RenameFields` / `fields.DropFields` / `fields.CastFields` | Rename or drop fields (views, no copy), or change field dtypes |
|
|
118
|
+
| `fields.MergeFields` | Field-wise merge of structured arrays |
|
|
119
|
+
| `fields.for_each_field` / `fields.field_items` | Row-keyed fan-out over a structured array |
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Dependencies
|
|
124
|
+
|
|
125
|
+
- [vcti-flow](https://github.com/vcollab/vcti-python-flow) (>=2.0.0) — the generic framework
|
|
126
|
+
- [vcti-datanode](https://github.com/vcollab/vcti-python-datanode) (>=2.0.0) — the `DataNode` payload
|
|
127
|
+
- [numpy](https://numpy.org/) (>=1.24)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vcti-dataflow"
|
|
7
|
+
version = "2.0.0"
|
|
8
|
+
description = "The DataNode binding of vcti-flow: sources, transformers, reducers, and combiners for vcti-datanode payloads."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [
|
|
11
|
+
{name = "Visual Collaboration Technologies Inc."}
|
|
12
|
+
]
|
|
13
|
+
requires-python = ">=3.12,<3.15"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"vcti-flow>=2.0.0",
|
|
16
|
+
"vcti-datanode>=2.0.0",
|
|
17
|
+
"numpy>=1.24",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[tool.setuptools.packages.find]
|
|
21
|
+
where = ["src"]
|
|
22
|
+
include = ["vcti.flow.data", "vcti.flow.data.*"]
|
|
23
|
+
namespaces = true
|
|
24
|
+
|
|
25
|
+
[tool.setuptools.package-data]
|
|
26
|
+
"vcti.flow.data" = ["py.typed"]
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
test = ["pytest", "pytest-cov"]
|
|
30
|
+
lint = ["ruff"]
|
|
31
|
+
typecheck = ["mypy"]
|
|
32
|
+
|
|
33
|
+
[tool.setuptools]
|
|
34
|
+
zip-safe = true
|
|
35
|
+
|
|
36
|
+
[tool.pytest.ini_options]
|
|
37
|
+
addopts = "--cov=vcti.flow.data --cov-report=term-missing --cov-fail-under=95"
|
|
38
|
+
|
|
39
|
+
[tool.mypy]
|
|
40
|
+
python_version = "3.12"
|
|
41
|
+
strict = true
|
|
42
|
+
files = ["src"]
|
|
43
|
+
namespace_packages = true
|
|
44
|
+
explicit_package_bases = true
|
|
45
|
+
mypy_path = ["src"]
|
|
46
|
+
|
|
47
|
+
[tool.coverage.run]
|
|
48
|
+
branch = true
|
|
49
|
+
|
|
50
|
+
[tool.coverage.report]
|
|
51
|
+
exclude_also = [
|
|
52
|
+
"raise NotImplementedError",
|
|
53
|
+
"if TYPE_CHECKING:",
|
|
54
|
+
"if __name__ == .__main__.:",
|
|
55
|
+
"@(abc\\.)?abstractmethod",
|
|
56
|
+
"\\.\\.\\.",
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
[tool.ruff]
|
|
60
|
+
target-version = "py312"
|
|
61
|
+
line-length = 99
|
|
62
|
+
|
|
63
|
+
[tool.ruff.lint]
|
|
64
|
+
select = ["E", "F", "W", "I", "UP"]
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""vcti.flow.data — the DataNode binding of vcti.flow.
|
|
4
|
+
|
|
5
|
+
DataNode-bound node kinds (``Source``, ``Transformer``, ``Reducer``, ``Sink``,
|
|
6
|
+
``Observer``) over the generic ``vcti.flow`` framework, plus ``from_array`` (eager)
|
|
7
|
+
and ``ArraySource`` (lazy) for building ``DataNode`` payloads. Structured-array-
|
|
8
|
+
specific nodes (field merge, row iteration, field shaping) live in the
|
|
9
|
+
``vcti.flow.data.fields`` submodule.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from importlib.metadata import version
|
|
13
|
+
|
|
14
|
+
from vcti.datanode import DataNode, EagerDataSource, LazyDataSource
|
|
15
|
+
|
|
16
|
+
from .aliases import (
|
|
17
|
+
Observer,
|
|
18
|
+
Reducer,
|
|
19
|
+
Sink,
|
|
20
|
+
Source,
|
|
21
|
+
Transformer,
|
|
22
|
+
)
|
|
23
|
+
from .record import from_array
|
|
24
|
+
from .sources import ArraySource
|
|
25
|
+
|
|
26
|
+
__version__ = version("vcti-dataflow")
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"__version__",
|
|
30
|
+
"ArraySource",
|
|
31
|
+
"DataNode",
|
|
32
|
+
"EagerDataSource",
|
|
33
|
+
"LazyDataSource",
|
|
34
|
+
"Observer",
|
|
35
|
+
"Reducer",
|
|
36
|
+
"Sink",
|
|
37
|
+
"Source",
|
|
38
|
+
"Transformer",
|
|
39
|
+
"from_array",
|
|
40
|
+
]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""DataNode-bound aliases for the vcti.flow node kinds.
|
|
4
|
+
|
|
5
|
+
These bind the generic ``vcti.flow`` framework to the ``DataNode`` payload, so
|
|
6
|
+
authors subclass ``Source`` / ``Transformer`` / … — the ``vcti.flow.data``
|
|
7
|
+
spelling of ``Source[DataNode]`` / ``Transformer[DataNode, DataNode]`` — instead
|
|
8
|
+
of repeating the type parameter everywhere. They are plain assignments (not PEP
|
|
9
|
+
695 ``type`` aliases) so they remain usable as base classes.
|
|
10
|
+
|
|
11
|
+
The names intentionally shadow the generic ``vcti.flow.core`` kinds: within this
|
|
12
|
+
binding ``Source`` *is* ``Source[DataNode]``. Note this ``Source`` is a flow leaf
|
|
13
|
+
node and is unrelated to ``vcti.datanode.DataSource`` — the array-source ABC
|
|
14
|
+
behind the re-exported ``EagerDataSource`` / ``LazyDataSource``. Dropping the
|
|
15
|
+
``Data`` prefix is what keeps that ``DataSource`` name from clashing here.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import vcti.flow.core as core
|
|
21
|
+
from vcti.datanode import DataNode
|
|
22
|
+
|
|
23
|
+
Source = core.Source[DataNode]
|
|
24
|
+
Transformer = core.Transformer[DataNode, DataNode]
|
|
25
|
+
Reducer = core.Reducer[DataNode, DataNode]
|
|
26
|
+
Sink = core.Sink[DataNode]
|
|
27
|
+
Observer = core.Observer[DataNode]
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""vcti.flow.data.fields — nodes for structured-array DataNode payloads.
|
|
4
|
+
|
|
5
|
+
These assume the DataNode's array is a structured (named-field) array. The base
|
|
6
|
+
``vcti.flow.data`` binding makes no such assumption (its ``ArraySource`` /
|
|
7
|
+
``from_array`` build shape-agnostic payloads).
|
|
8
|
+
|
|
9
|
+
- Source: ``RowTableSource`` (table from mapping rows) — lazy.
|
|
10
|
+
- Transformers: ``NameFields`` (name plain columns), ``SelectFields`` (select /
|
|
11
|
+
rename), ``RenameFields`` (rename, keep the rest), ``DropFields`` (drop a
|
|
12
|
+
subset), ``CastFields`` (change dtypes), ``ComputeFields`` (append / replace
|
|
13
|
+
computed fields).
|
|
14
|
+
- Reduce: ``MergeFields`` (field-wise merge).
|
|
15
|
+
- Iterate: ``for_each_field`` / ``field_items`` (row-keyed fan-out).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from .iterate import field_items, for_each_field
|
|
19
|
+
from .merge import MergeFields
|
|
20
|
+
from .sources import RowTableSource
|
|
21
|
+
from .transforms import (
|
|
22
|
+
CastFields,
|
|
23
|
+
ComputeFields,
|
|
24
|
+
DropFields,
|
|
25
|
+
NameFields,
|
|
26
|
+
RenameFields,
|
|
27
|
+
SelectFields,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"CastFields",
|
|
32
|
+
"ComputeFields",
|
|
33
|
+
"DropFields",
|
|
34
|
+
"MergeFields",
|
|
35
|
+
"NameFields",
|
|
36
|
+
"RenameFields",
|
|
37
|
+
"RowTableSource",
|
|
38
|
+
"SelectFields",
|
|
39
|
+
"field_items",
|
|
40
|
+
"for_each_field",
|
|
41
|
+
]
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""Row iteration over a structured-array DataNode, as a flow combinator."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from collections.abc import Callable, Iterable, Iterator
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from vcti.datanode import DataNode
|
|
11
|
+
from vcti.flow.core import Node, for_each
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def field_items(key_field: str = "ID") -> Callable[[DataNode], Iterable[Any]]:
    """Build an items-extractor that collects *key_field* from every row.

    The returned callable is meant to be passed as the ``items`` argument of
    ``vcti.flow.core.for_each``. When called with a node it loads the node's
    array and returns one *key_field* value per row, in row order. A node
    whose ``load()`` gives ``None``, or whose array has zero rows, produces an
    empty list.

    Args:
        key_field: Name of the field to read from each row.

    Returns:
        A callable mapping a DataNode to the list of its row keys.

    Raises:
        ValueError: (from the returned callable) if the array is structured
            and lacks *key_field*.
    """

    def extract(record: DataNode) -> list[Any]:
        table = record.load()
        # Nothing to iterate: no array at all, or an array with no rows.
        if table is None:
            return []
        if table.shape[0] == 0:
            return []

        # Only structured arrays expose field names; check them up front so a
        # missing key is reported once, with the available names.
        field_names = table.dtype.names
        if field_names is not None and key_field not in field_names:
            raise ValueError(
                f"Key field {key_field!r} not found in source fields {field_names}."
            )

        keys: list[Any] = []
        for row in table:
            keys.append(row[key_field])
        return keys

    return extract
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def for_each_field[U](
|
|
39
|
+
source: Node[DataNode],
|
|
40
|
+
factory: Callable[[Any], Node[U]],
|
|
41
|
+
key_field: str = "ID",
|
|
42
|
+
) -> Iterator[tuple[Any, Node[U]]]:
|
|
43
|
+
"""Fan a keys DataNode out into one flow per row, keyed by *key_field*.
|
|
44
|
+
|
|
45
|
+
A DataNode-specific convenience over ``vcti.flow.core.for_each``: it yields
|
|
46
|
+
``(key, flow)`` pairs, one per row of the source's structured array.
|
|
47
|
+
"""
|
|
48
|
+
return for_each(source, field_items(key_field), factory)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""MergeFields — merge structured-array DataNodes field-wise."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from numpy.lib import recfunctions as rfn
|
|
10
|
+
from vcti.datanode import DataNode
|
|
11
|
+
|
|
12
|
+
from ..aliases import Reducer
|
|
13
|
+
from ..record import from_array
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MergeFields(Reducer):
    """Combine several DataNodes side by side into one structured array.

    Only inputs that report ``has_data`` and whose ``load()`` returns an array
    take part; the rest are ignored, attributes included. With a single
    participating input its array is passed through as-is; with several, the
    arrays are joined field-wise by ``numpy.lib.recfunctions.merge_arrays``
    (flattened, unmasked) and must all have the same number of rows.
    Attributes of the participating inputs are merged in input order, so a
    later input overrides an earlier one (**last-wins**). If no input
    participates, an empty DataNode from ``from_array()`` is returned.

    Raises:
        ValueError: If the input arrays have different row counts.
    """

    def reduce(self, records: list[DataNode]) -> DataNode:
        """Merge *records* into a single DataNode (see the class docstring)."""
        # Keep each node next to its loaded array: the attributes are needed
        # later, and load() should run once per node.
        populated: list[tuple[DataNode, Any]] = []
        for node in records:
            if not node.has_data:
                continue
            data = node.load()
            if data is not None:
                populated.append((node, data))

        if not populated:
            return from_array()

        arrays = [data for _, data in populated]
        head, *rest = arrays

        # Row counts are only compared when there is something to merge with.
        for position, data in enumerate(rest, start=1):
            if len(data) != len(head):
                raise ValueError(
                    f"Cannot merge arrays with different lengths. Array 0 has "
                    f"{len(head)} rows, but array {position} has {len(data)} rows."
                )

        if rest:
            merged = rfn.merge_arrays(arrays, flatten=True, usemask=False)
        else:
            merged = head

        # Later inputs overwrite earlier ones on key collisions.
        combined: dict[str, Any] = {}
        for node, _ in populated:
            if node.attributes:
                combined.update(node.attributes)

        return from_array(merged, combined or None)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""Leaf source that builds a structured-array DataNode from mapping rows (lazily)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from collections.abc import Callable, Mapping, Sequence
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from vcti.datanode import DataNode, LazyDataSource
|
|
12
|
+
|
|
13
|
+
from ..aliases import Source
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _column_from_rows(rows: Sequence[Mapping[str, Any]], src_key: str) -> np.ndarray:
|
|
17
|
+
"""Build one structured-array column from a sequence of mapping rows.
|
|
18
|
+
|
|
19
|
+
Gathers ``row[src_key]`` across rows (absent / ``None`` = missing) and picks
|
|
20
|
+
a dtype from the present scalar values: all-int → ``i8`` (missing 0),
|
|
21
|
+
all-str/bytes → ``U<maxlen>`` (missing ""), otherwise ``f8`` (missing NaN,
|
|
22
|
+
non-numeric coerced to NaN).
|
|
23
|
+
"""
|
|
24
|
+
n = len(rows)
|
|
25
|
+
present = [(i, r[src_key]) for i, r in enumerate(rows) if r.get(src_key) is not None]
|
|
26
|
+
values = [v for _, v in present]
|
|
27
|
+
|
|
28
|
+
def _is_int(v: Any) -> bool:
|
|
29
|
+
return isinstance(v, (int, np.integer)) and not isinstance(v, bool)
|
|
30
|
+
|
|
31
|
+
def _is_str(v: Any) -> bool:
|
|
32
|
+
return isinstance(v, (str, bytes, np.bytes_, np.str_))
|
|
33
|
+
|
|
34
|
+
if values and all(_is_int(v) for v in values):
|
|
35
|
+
int_col = np.zeros(n, dtype="i8")
|
|
36
|
+
for i, v in present:
|
|
37
|
+
int_col[i] = int(v)
|
|
38
|
+
return int_col
|
|
39
|
+
|
|
40
|
+
if values and all(_is_str(v) for v in values):
|
|
41
|
+
decoded = [
|
|
42
|
+
v.decode("utf-8", "replace").rstrip("\x00")
|
|
43
|
+
if isinstance(v, (bytes, np.bytes_))
|
|
44
|
+
else str(v)
|
|
45
|
+
for v in values
|
|
46
|
+
]
|
|
47
|
+
width = max((len(s) for s in decoded), default=1) or 1
|
|
48
|
+
str_col = np.zeros(n, dtype=f"U{width}")
|
|
49
|
+
for (i, _), s in zip(present, decoded):
|
|
50
|
+
str_col[i] = s
|
|
51
|
+
return str_col
|
|
52
|
+
|
|
53
|
+
float_col = np.full(n, np.nan, dtype="f8")
|
|
54
|
+
for i, v in present:
|
|
55
|
+
try:
|
|
56
|
+
float_col[i] = float(v)
|
|
57
|
+
except (TypeError, ValueError):
|
|
58
|
+
pass # non-scalar / non-numeric stays NaN
|
|
59
|
+
return float_col
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class RowTableSource(Source):
    """Leaf source turning heterogeneous mapping rows into a structured table.

    ``columns`` chooses which row keys become output fields: a list keeps the
    key names, a dict maps ``source key -> output field name``. Every column's
    dtype is inferred from the values present in the rows, and missing entries
    are filled in (0 / "" / NaN). ``rows_fn`` is wrapped in a
    ``LazyDataSource``, so it runs when the resulting node's data is loaded
    rather than at ``execute()`` time.
    """

    def __init__(
        self,
        rows_fn: Callable[[], Sequence[Mapping[str, Any]]],
        columns: list[str] | dict[str, str],
        *,
        name: str | None = None,
        attributes: Mapping[str, Any] | None = None,
    ) -> None:
        """Store the row provider, the column mapping and the node attributes.

        Args:
            rows_fn: Zero-argument callable returning the mapping rows.
            columns: Row keys to extract, optionally renamed via a dict.
            name: Node name forwarded to the base ``Source``.
            attributes: Attributes attached to the produced DataNode.
        """
        super().__init__(name=name)
        self._rows_fn = rows_fn
        # Normalise both accepted forms to {source key: output field}.
        if isinstance(columns, dict):
            self._mapping = dict(columns)
        else:
            self._mapping = {key: key for key in columns}
        self._attributes = attributes

    def load(self) -> DataNode:
        """Return a DataNode whose table is assembled on first data access."""

        def assemble() -> np.ndarray:
            # Materialise once so every column sees the same rows.
            records = list(self._rows_fn())

            columns_by_field = {}
            for source_key, field in self._mapping.items():
                columns_by_field[field] = _column_from_rows(records, source_key)

            layout = np.dtype(
                [(field, columns_by_field[field].dtype) for field in self._mapping.values()]
            )
            table = np.empty(len(records), dtype=layout)
            for field in self._mapping.values():
                table[field] = columns_by_field[field]
            return table

        return DataNode(
            data_source=LazyDataSource(assemble),
            source_attributes=self._attributes,
        )
|