vcti-dataflow 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vcti/flow/data/__init__.py +40 -0
- vcti/flow/data/aliases.py +27 -0
- vcti/flow/data/fields/__init__.py +41 -0
- vcti/flow/data/fields/iterate.py +48 -0
- vcti/flow/data/fields/merge.py +53 -0
- vcti/flow/data/fields/sources.py +94 -0
- vcti/flow/data/fields/transforms.py +203 -0
- vcti/flow/data/py.typed +0 -0
- vcti/flow/data/record.py +34 -0
- vcti/flow/data/sources.py +40 -0
- vcti_dataflow-2.0.0.dist-info/METADATA +147 -0
- vcti_dataflow-2.0.0.dist-info/RECORD +16 -0
- vcti_dataflow-2.0.0.dist-info/WHEEL +5 -0
- vcti_dataflow-2.0.0.dist-info/licenses/LICENSE +8 -0
- vcti_dataflow-2.0.0.dist-info/top_level.txt +1 -0
- vcti_dataflow-2.0.0.dist-info/zip-safe +1 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""vcti.flow.data — the DataNode binding of vcti.flow.
|
|
4
|
+
|
|
5
|
+
DataNode-bound node kinds (``Source``, ``Transformer``, ``Reducer``, ``Sink``,
|
|
6
|
+
``Observer``) over the generic ``vcti.flow`` framework, plus ``from_array`` (eager)
|
|
7
|
+
and ``ArraySource`` (lazy) for building ``DataNode`` payloads. Structured-array-
|
|
8
|
+
specific nodes (field merge, row iteration, field shaping) live in the
|
|
9
|
+
``vcti.flow.data.fields`` submodule.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from importlib.metadata import version
|
|
13
|
+
|
|
14
|
+
from vcti.datanode import DataNode, EagerDataSource, LazyDataSource
|
|
15
|
+
|
|
16
|
+
from .aliases import (
|
|
17
|
+
Observer,
|
|
18
|
+
Reducer,
|
|
19
|
+
Sink,
|
|
20
|
+
Source,
|
|
21
|
+
Transformer,
|
|
22
|
+
)
|
|
23
|
+
from .record import from_array
|
|
24
|
+
from .sources import ArraySource
|
|
25
|
+
|
|
26
|
+
__version__ = version("vcti-dataflow")
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"__version__",
|
|
30
|
+
"ArraySource",
|
|
31
|
+
"DataNode",
|
|
32
|
+
"EagerDataSource",
|
|
33
|
+
"LazyDataSource",
|
|
34
|
+
"Observer",
|
|
35
|
+
"Reducer",
|
|
36
|
+
"Sink",
|
|
37
|
+
"Source",
|
|
38
|
+
"Transformer",
|
|
39
|
+
"from_array",
|
|
40
|
+
]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""DataNode-bound aliases for the vcti.flow node kinds.
|
|
4
|
+
|
|
5
|
+
These bind the generic ``vcti.flow`` framework to the ``DataNode`` payload, so
|
|
6
|
+
authors subclass ``Source`` / ``Transformer`` / … — the ``vcti.flow.data``
|
|
7
|
+
spelling of ``Source[DataNode]`` / ``Transformer[DataNode, DataNode]`` — instead
|
|
8
|
+
of repeating the type parameter everywhere. They are plain assignments (not PEP
|
|
9
|
+
695 ``type`` aliases) so they remain usable as base classes.
|
|
10
|
+
|
|
11
|
+
The names intentionally shadow the generic ``vcti.flow.core`` kinds: within this
|
|
12
|
+
binding ``Source`` *is* ``Source[DataNode]``. Note this ``Source`` is a flow leaf
|
|
13
|
+
node and is unrelated to ``vcti.datanode.DataSource`` — the array-source ABC
|
|
14
|
+
behind the re-exported ``EagerDataSource`` / ``LazyDataSource``. Dropping the
|
|
15
|
+
``Data`` prefix is what keeps that ``DataSource`` name from clashing here.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import vcti.flow.core as core
|
|
21
|
+
from vcti.datanode import DataNode
|
|
22
|
+
|
|
23
|
+
# Each alias is the generic ``vcti.flow.core`` kind subscripted with DataNode
# and bound by plain assignment, so the resulting name can be used directly as
# a base class (e.g. ``class MyNode(Transformer): ...``).
# Transformer and Reducer take two type arguments; both are DataNode here.
Source = core.Source[DataNode]
Transformer = core.Transformer[DataNode, DataNode]
Reducer = core.Reducer[DataNode, DataNode]
Sink = core.Sink[DataNode]
Observer = core.Observer[DataNode]
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""vcti.flow.data.fields — nodes for structured-array DataNode payloads.
|
|
4
|
+
|
|
5
|
+
These assume the DataNode's array is a structured (named-field) array. The base
|
|
6
|
+
``vcti.flow.data`` binding makes no such assumption (its ``ArraySource`` /
|
|
7
|
+
``from_array`` build shape-agnostic payloads).
|
|
8
|
+
|
|
9
|
+
- Source: ``RowTableSource`` (table from mapping rows) — lazy.
|
|
10
|
+
- Transformers: ``NameFields`` (name plain columns), ``SelectFields`` (select /
|
|
11
|
+
rename), ``RenameFields`` (rename, keep the rest), ``DropFields`` (drop a
|
|
12
|
+
subset), ``CastFields`` (change dtypes), ``ComputeFields`` (append / replace
|
|
13
|
+
computed fields).
|
|
14
|
+
- Reduce: ``MergeFields`` (field-wise merge).
|
|
15
|
+
- Iterate: ``for_each_field`` / ``field_items`` (row-keyed fan-out).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from .iterate import field_items, for_each_field
|
|
19
|
+
from .merge import MergeFields
|
|
20
|
+
from .sources import RowTableSource
|
|
21
|
+
from .transforms import (
|
|
22
|
+
CastFields,
|
|
23
|
+
ComputeFields,
|
|
24
|
+
DropFields,
|
|
25
|
+
NameFields,
|
|
26
|
+
RenameFields,
|
|
27
|
+
SelectFields,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"CastFields",
|
|
32
|
+
"ComputeFields",
|
|
33
|
+
"DropFields",
|
|
34
|
+
"MergeFields",
|
|
35
|
+
"NameFields",
|
|
36
|
+
"RenameFields",
|
|
37
|
+
"RowTableSource",
|
|
38
|
+
"SelectFields",
|
|
39
|
+
"field_items",
|
|
40
|
+
"for_each_field",
|
|
41
|
+
]
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""Row iteration over a structured-array DataNode, as a flow combinator."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from collections.abc import Callable, Iterable, Iterator
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from vcti.datanode import DataNode
|
|
11
|
+
from vcti.flow.core import Node, for_each
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def field_items(key_field: str = "ID") -> Callable[[DataNode], Iterable[Any]]:
    """Build an items-extractor that collects *key_field* from every row.

    The returned callable is meant to be passed as the ``items`` argument of
    ``vcti.flow.core.for_each``. When called with a node it loads the node's
    array and returns one value per row, read from *key_field*. A node whose
    load yields ``None``, or an array with zero rows, produces an empty list.

    Args:
        key_field: Name of the field to read from each row.

    Returns:
        A callable mapping a node to the list of its per-row key values.

    Raises:
        ValueError: (from the extractor) if the array is structured and lacks
            *key_field*.
    """

    def extract(record: DataNode) -> list[Any]:
        table = record.load()
        has_rows = table is not None and table.shape[0] != 0
        if not has_rows:
            return []

        field_names = table.dtype.names
        # Only structured arrays are validated here; a plain array skips the check.
        if not (field_names is None or key_field in field_names):
            raise ValueError(
                f"Key field {key_field!r} not found in source fields {field_names}."
            )

        keys: list[Any] = []
        for row in table:
            keys.append(row[key_field])
        return keys

    return extract
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def for_each_field[U](
|
|
39
|
+
source: Node[DataNode],
|
|
40
|
+
factory: Callable[[Any], Node[U]],
|
|
41
|
+
key_field: str = "ID",
|
|
42
|
+
) -> Iterator[tuple[Any, Node[U]]]:
|
|
43
|
+
"""Fan a keys DataNode out into one flow per row, keyed by *key_field*.
|
|
44
|
+
|
|
45
|
+
A DataNode-specific convenience over ``vcti.flow.core.for_each``: it yields
|
|
46
|
+
``(key, flow)`` pairs, one per row of the source's structured array.
|
|
47
|
+
"""
|
|
48
|
+
return for_each(source, field_items(key_field), factory)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""MergeFields — merge structured-array DataNodes field-wise."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from numpy.lib import recfunctions as rfn
|
|
10
|
+
from vcti.datanode import DataNode
|
|
11
|
+
|
|
12
|
+
from ..aliases import Reducer
|
|
13
|
+
from ..record import from_array
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MergeFields(Reducer):
    """Combine several DataNodes into one by placing their fields side by side.

    The arrays are joined field-wise through
    ``numpy.lib.recfunctions.merge_arrays`` and therefore must all have the
    same number of rows. Attributes are merged in input order, so a later
    input overrides an earlier one (**last-wins**). Inputs without data are
    ignored entirely; when no input carries data the result is an empty
    (metadata-only) DataNode.

    Raises:
        ValueError: If the input arrays have different row counts.
    """

    def reduce(self, records: list[DataNode]) -> DataNode:
        # Keep only inputs that both report data and actually load an array.
        contributing: list[tuple[DataNode, Any]] = []
        for node in records:
            if not node.has_data:
                continue
            payload = node.load()
            if payload is not None:
                contributing.append((node, payload))

        if not contributing:
            return from_array()

        arrays = [payload for _, payload in contributing]
        head, *tail = arrays

        if tail:
            # Every later array is compared against the first one's row count.
            first_length = len(head)
            for i, arr in enumerate(tail, start=1):
                if len(arr) != first_length:
                    raise ValueError(
                        f"Cannot merge arrays with different lengths. Array 0 has "
                        f"{first_length} rows, but array {i} has {len(arr)} rows."
                    )
            merged = rfn.merge_arrays(arrays, flatten=True, usemask=False)
        else:
            # A single contributing array is passed through untouched.
            merged = head

        combined: dict[str, Any] = {}
        for node, _ in contributing:
            attrs = node.attributes
            if attrs:
                combined.update(attrs)

        return from_array(merged, combined or None)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""Leaf source that builds a structured-array DataNode from mapping rows (lazily)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from collections.abc import Callable, Mapping, Sequence
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from vcti.datanode import DataNode, LazyDataSource
|
|
12
|
+
|
|
13
|
+
from ..aliases import Source
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _column_from_rows(rows: Sequence[Mapping[str, Any]], src_key: str) -> np.ndarray:
|
|
17
|
+
"""Build one structured-array column from a sequence of mapping rows.
|
|
18
|
+
|
|
19
|
+
Gathers ``row[src_key]`` across rows (absent / ``None`` = missing) and picks
|
|
20
|
+
a dtype from the present scalar values: all-int → ``i8`` (missing 0),
|
|
21
|
+
all-str/bytes → ``U<maxlen>`` (missing ""), otherwise ``f8`` (missing NaN,
|
|
22
|
+
non-numeric coerced to NaN).
|
|
23
|
+
"""
|
|
24
|
+
n = len(rows)
|
|
25
|
+
present = [(i, r[src_key]) for i, r in enumerate(rows) if r.get(src_key) is not None]
|
|
26
|
+
values = [v for _, v in present]
|
|
27
|
+
|
|
28
|
+
def _is_int(v: Any) -> bool:
|
|
29
|
+
return isinstance(v, (int, np.integer)) and not isinstance(v, bool)
|
|
30
|
+
|
|
31
|
+
def _is_str(v: Any) -> bool:
|
|
32
|
+
return isinstance(v, (str, bytes, np.bytes_, np.str_))
|
|
33
|
+
|
|
34
|
+
if values and all(_is_int(v) for v in values):
|
|
35
|
+
int_col = np.zeros(n, dtype="i8")
|
|
36
|
+
for i, v in present:
|
|
37
|
+
int_col[i] = int(v)
|
|
38
|
+
return int_col
|
|
39
|
+
|
|
40
|
+
if values and all(_is_str(v) for v in values):
|
|
41
|
+
decoded = [
|
|
42
|
+
v.decode("utf-8", "replace").rstrip("\x00")
|
|
43
|
+
if isinstance(v, (bytes, np.bytes_))
|
|
44
|
+
else str(v)
|
|
45
|
+
for v in values
|
|
46
|
+
]
|
|
47
|
+
width = max((len(s) for s in decoded), default=1) or 1
|
|
48
|
+
str_col = np.zeros(n, dtype=f"U{width}")
|
|
49
|
+
for (i, _), s in zip(present, decoded):
|
|
50
|
+
str_col[i] = s
|
|
51
|
+
return str_col
|
|
52
|
+
|
|
53
|
+
float_col = np.full(n, np.nan, dtype="f8")
|
|
54
|
+
for i, v in present:
|
|
55
|
+
try:
|
|
56
|
+
float_col[i] = float(v)
|
|
57
|
+
except (TypeError, ValueError):
|
|
58
|
+
pass # non-scalar / non-numeric stays NaN
|
|
59
|
+
return float_col
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class RowTableSource(Source):
    """Leaf source turning heterogeneous mapping rows into a structured table.

    ``columns`` chooses which source keys become output fields: a list keeps
    the key names as field names, a dict maps ``{source_key: field_name}``.
    Each column's dtype is inferred from the values present, and missing
    entries are filled (0 / "" / NaN). ``rows_fn`` runs lazily — only when the
    resulting node is loaded, not at ``execute()`` time.
    """

    def __init__(
        self,
        rows_fn: Callable[[], Sequence[Mapping[str, Any]]],
        columns: list[str] | dict[str, str],
        *,
        name: str | None = None,
        attributes: Mapping[str, Any] | None = None,
    ) -> None:
        super().__init__(name=name)
        self._rows_fn = rows_fn
        # Normalise both accepted forms to a private {source_key: field_name} dict.
        if isinstance(columns, dict):
            self._mapping = dict(columns)
        else:
            self._mapping = {key: key for key in columns}
        self._attributes = attributes

    def load(self) -> DataNode:
        def build() -> np.ndarray:
            rows = list(self._rows_fn())

            columns_by_field: dict[str, np.ndarray] = {}
            for source_key, field in self._mapping.items():
                columns_by_field[field] = _column_from_rows(rows, source_key)

            field_names = list(self._mapping.values())
            layout = np.dtype(
                [(field, columns_by_field[field].dtype) for field in field_names]
            )
            table = np.empty(len(rows), dtype=layout)
            for field in field_names:
                table[field] = columns_by_field[field]
            return table

        # The builder is handed over uncalled; the data source decides when to run it.
        return DataNode(data_source=LazyDataSource(build), source_attributes=self._attributes)
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""Transformers that reshape structured-array DataNodes (eager)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from collections.abc import Callable
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from numpy.lib import recfunctions as rfn
|
|
12
|
+
from vcti.datanode import DataNode
|
|
13
|
+
|
|
14
|
+
from ..aliases import Transformer
|
|
15
|
+
from ..record import from_array
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class NameFields(Transformer):
    """Name the columns of a plain array, producing a structured field group.

    A 1-D array becomes a single named field; a 2-D ``(N, K)`` array becomes K
    named fields (e.g. ``["X", "Y", "Z"]`` for coordinates). Arrays with more
    than two dimensions have their trailing dimensions flattened into columns.
    Zero-row arrays are supported and yield an empty structured array.

    Raises:
        ValueError: If the input has no data, is already structured, or its
            column count does not match the number of names given.
    """

    def __init__(self, names: str | list[str], *, name: str | None = None) -> None:
        super().__init__(name=name)
        # A bare string is a single field name, not an iterable of characters.
        self._names = [names] if isinstance(names, str) else list(names)

    def transform(self, record: DataNode) -> DataNode:
        arr = record.load()
        if arr is None:
            raise ValueError(f"{self.name}: input record has no data")
        if arr.dtype.names is not None:
            raise ValueError(f"{self.name}: expected a plain array, got a structured one")

        # Compute the column count explicitly rather than reshaping with -1:
        # NumPy cannot infer an unknown dimension when the array has zero rows,
        # which would make valid empty inputs fail with an obscure reshape error.
        n_rows = len(arr)
        n_cols = 1
        for dim in arr.shape[1:]:
            n_cols *= dim
        cols = arr.reshape(n_rows, n_cols)

        if cols.shape[1] != len(self._names):
            raise ValueError(
                f"{self.name}: array has {cols.shape[1]} column(s) "
                f"but {len(self._names)} name(s) were given"
            )

        # Every output field shares the input's element dtype.
        out_dtype = np.dtype([(n, arr.dtype) for n in self._names])
        out = np.empty(n_rows, dtype=out_dtype)
        for i, field in enumerate(self._names):
            out[field] = cols[:, i]
        return from_array(out, record.attributes)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class SelectFields(Transformer):
    """Keep a chosen subset of a structured field group, optionally renaming.

    Give a list of field names to select them as-is, or a dict
    ``{source: output}`` to select and rename in one step. The selected fields
    are copied into a newly allocated array in the order given.

    Raises:
        ValueError: If the input has no data, is not structured, or a
            requested source field does not exist.
    """

    def __init__(self, fields: list[str] | dict[str, str], *, name: str | None = None) -> None:
        super().__init__(name=name)
        if isinstance(fields, dict):
            self._mapping = dict(fields)
        else:
            self._mapping = {field: field for field in fields}

    def transform(self, record: DataNode) -> DataNode:
        arr = record.load()
        # Missing data and a non-structured array are reported identically.
        available = None if arr is None else arr.dtype.fields
        if available is None:
            raise ValueError(f"{self.name}: requires a structured field group")

        missing = [src for src in self._mapping if src not in available]
        if missing:
            raise ValueError(
                f"{self.name}: unknown field(s) {missing}; available: {list(available)}"
            )

        pairs = list(self._mapping.items())
        # dtype.fields maps name -> (dtype, offset, ...); element 0 is the field dtype.
        out_dtype = np.dtype([(dst, available[src][0]) for src, dst in pairs])
        out = np.empty(arr.shape, dtype=out_dtype)
        for src, dst in pairs:
            out[dst] = arr[src]
        return from_array(out, record.attributes)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class ComputeFields(Transformer):
    """Add computed fields to a structured field group, or overwrite existing ones.

    Takes ``{name: fn}``. Each ``fn`` is called with the structured array and
    must return a 1-D array with one value per row. Fields are evaluated in
    dict order against the running result, so a later function can read a
    field produced by an earlier one. A name that already exists is
    overwritten in a copy; a new name is appended. Example::

        ComputeFields({"vmag": lambda a: np.sqrt(a["UX"] ** 2 + a["UY"] ** 2)})

    Raises:
        ValueError: If the input is missing or not structured, or a computed
            field has the wrong number of rows.
    """

    def __init__(
        self,
        fields: dict[str, Callable[[np.ndarray], Any]],
        *,
        name: str | None = None,
    ) -> None:
        super().__init__(name=name)
        self._fields = dict(fields)

    def transform(self, record: DataNode) -> DataNode:
        source = record.load()
        structured = source is not None and source.dtype.names is not None
        if not structured:
            raise ValueError(f"{self.name}: requires a structured field group")

        expected_rows = len(source)
        result = source
        for field_name, compute in self._fields.items():
            column = np.asarray(compute(result))
            produced = len(column)
            if produced != expected_rows:
                raise ValueError(
                    f"{self.name}: computed field {field_name!r} has {produced} "
                    f"row(s), expected {expected_rows}"
                )

            existing = result.dtype.names or ()
            if field_name not in existing:
                result = rfn.append_fields(result, field_name, column, usemask=False)
                continue

            # Overwrite on a copy so the array handed in is never mutated.
            result = result.copy()
            result[field_name] = column

        return from_array(result, record.attributes)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class RenameFields(Transformer):
    """Relabel selected fields of a structured field group, leaving the others alone.

    Takes ``{old: new}``; fields not listed keep their name and position.
    Renaming only relabels the dtype, so no array data is copied — the result
    shares memory with the input.

    Raises:
        ValueError: If the input is missing or not structured, an ``old`` name
            does not exist, or the rename would produce duplicate field names.
    """

    def __init__(self, names: dict[str, str], *, name: str | None = None) -> None:
        super().__init__(name=name)
        self._mapping = dict(names)

    def transform(self, record: DataNode) -> DataNode:
        arr = record.load()
        # Missing data and a non-structured array are reported identically.
        current = None if arr is None else arr.dtype.names
        if current is None:
            raise ValueError(f"{self.name}: requires a structured field group")

        unknown = [old for old in self._mapping if old not in current]
        if unknown:
            raise ValueError(
                f"{self.name}: unknown field(s) {unknown}; available: {list(current)}"
            )

        # Work out the resulting names up front so collisions are reported clearly.
        renamed = [self._mapping.get(field, field) for field in current]
        has_duplicates = len(set(renamed)) != len(renamed)
        if has_duplicates:
            raise ValueError(f"{self.name}: rename produces duplicate field name(s) in {renamed}")

        relabelled = rfn.rename_fields(arr, self._mapping)
        return from_array(relabelled, record.attributes)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class DropFields(Transformer):
    """Remove named fields from a structured field group and keep everything else.

    The surviving fields are returned as a view over the input array (no data
    copy). The inverse of ``SelectFields`` — ergonomic when you keep most fields.

    Raises:
        ValueError: If a named field is absent, or if every field would be dropped.
    """

    def __init__(self, fields: list[str], *, name: str | None = None) -> None:
        super().__init__(name=name)
        self._drop = list(fields)

    def transform(self, record: DataNode) -> DataNode:
        arr = record.load()
        # Missing data and a non-structured array are reported identically.
        current = None if arr is None else arr.dtype.names
        if current is None:
            raise ValueError(f"{self.name}: requires a structured field group")

        unknown = [field for field in self._drop if field not in current]
        if unknown:
            raise ValueError(
                f"{self.name}: unknown field(s) {unknown}; available: {list(current)}"
            )

        unwanted = set(self._drop)
        kept: list[str] = []
        for field in current:
            if field not in unwanted:
                kept.append(field)
        if not kept:
            raise ValueError(f"{self.name}: dropping all fields leaves nothing")

        # Indexing with a list of names selects those fields in their original order.
        return from_array(arr[kept], record.attributes)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class CastFields(Transformer):
    """Change the dtype of selected fields in a structured field group.

    Takes ``{field: dtype}``; fields not listed keep their dtype. A structured
    array is contiguous in memory, so casting rebuilds it — this is the one
    field op that copies. Narrowing casts (e.g. ``f8`` -> ``f4``) lose
    precision by design.

    Raises:
        ValueError: If the input is missing or not structured, or a listed
            field does not exist.
    """

    def __init__(self, dtypes: dict[str, Any], *, name: str | None = None) -> None:
        super().__init__(name=name)
        # Normalise every target to an np.dtype once, at construction time.
        self._dtypes = {}
        for field, spec in dtypes.items():
            self._dtypes[field] = np.dtype(spec)

    def transform(self, record: DataNode) -> DataNode:
        arr = record.load()
        if arr is None:
            raise ValueError(f"{self.name}: requires a structured field group")

        current = arr.dtype.names
        layout = arr.dtype.fields
        if current is None or layout is None:
            raise ValueError(f"{self.name}: requires a structured field group")

        unknown = [field for field in self._dtypes if field not in current]
        if unknown:
            raise ValueError(
                f"{self.name}: unknown field(s) {unknown}; available: {list(current)}"
            )

        # dtype.fields maps name -> (dtype, offset, ...); element 0 is the field dtype.
        spec = [(field, self._dtypes.get(field, layout[field][0])) for field in current]
        out = np.empty(arr.shape, dtype=np.dtype(spec))
        for field in current:
            out[field] = arr[field]
        return from_array(out, record.attributes)
|
vcti/flow/data/py.typed
ADDED
|
File without changes
|
vcti/flow/data/record.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""Build DataNode payloads for flow graphs from in-memory arrays."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from collections.abc import Mapping
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from vcti.datanode import DataNode, EagerDataSource
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def from_array(
    array: np.ndarray | None = None,
    attributes: Mapping[str, Any] | None = None,
) -> DataNode:
    """Wrap an in-memory array (and optional metadata) in a DataNode.

    A supplied array is held by an ``EagerDataSource``: it is already resident,
    so there is nothing to defer. Attributes are stored in the enriched layer
    (the convention for derived metadata); a leaf source reading external data
    uses ``source_attributes`` directly instead. With *array* left as ``None``
    the node gets no data source at all — a metadata-only node.

    Args:
        array: The array to wrap, or ``None`` for a metadata-only node.
        attributes: Optional metadata, stored in ``enriched_attributes``.

    Returns:
        A ``DataNode`` with an eager data source, or none when *array* is ``None``.
    """
    if array is None:
        source = None
    else:
        source = EagerDataSource(array)
    return DataNode(data_source=source, enriched_attributes=attributes)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""Lazy leaf source over a callable that returns an array."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from collections.abc import Callable, Mapping
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from vcti.datanode import DataNode, LazyDataSource
|
|
12
|
+
|
|
13
|
+
from .aliases import Source
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ArraySource(Source):
    """Lazy DataNode source backed by a callable that returns an array.

    ``load_fn`` returns a numpy array and runs only when the node is loaded, so
    a heavy read (file, service, reader accessor) is deferred past
    ``execute()``. This is the lazy counterpart of :func:`from_array` (which is
    eager); it makes no assumption about array shape.
    """

    def __init__(
        self,
        load_fn: Callable[[], np.ndarray],
        *,
        name: str | None = None,
        attributes: Mapping[str, Any] | None = None,
    ) -> None:
        super().__init__(name=name)
        self._load_fn = load_fn
        self._attributes = attributes

    def load(self) -> DataNode:
        # The callable is handed over uncalled; the data source decides when to run it.
        lazy_source = LazyDataSource(self._load_fn)
        return DataNode(data_source=lazy_source, source_attributes=self._attributes)
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vcti-dataflow
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: The DataNode binding of vcti-flow: sources, transformers, reducers, and combiners for vcti-datanode payloads.
|
|
5
|
+
Author: Visual Collaboration Technologies Inc.
|
|
6
|
+
Requires-Python: <3.15,>=3.12
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: vcti-flow>=2.0.0
|
|
10
|
+
Requires-Dist: vcti-datanode>=2.0.0
|
|
11
|
+
Requires-Dist: numpy>=1.24
|
|
12
|
+
Provides-Extra: test
|
|
13
|
+
Requires-Dist: pytest; extra == "test"
|
|
14
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
15
|
+
Provides-Extra: lint
|
|
16
|
+
Requires-Dist: ruff; extra == "lint"
|
|
17
|
+
Provides-Extra: typecheck
|
|
18
|
+
Requires-Dist: mypy; extra == "typecheck"
|
|
19
|
+
Dynamic: license-file
|
|
20
|
+
|
|
21
|
+
# Data Flow
|
|
22
|
+
|
|
23
|
+
The DataNode binding of vcti-flow: sources, transformers, reducers, and combiners for vcti-datanode payloads.
|
|
24
|
+
|
|
25
|
+
## Overview
|
|
26
|
+
|
|
27
|
+
[vcti-flow](https://github.com/vcollab/vcti-python-flow) is a payload-agnostic
|
|
28
|
+
framework for composing flow graphs — it never inspects the values flowing
|
|
29
|
+
through it. `vcti.flow.data` binds that framework to the
|
|
30
|
+
[vcti-datanode](https://github.com/vcollab/vcti-python-datanode) `DataNode`
|
|
31
|
+
payload (data plus layered attributes behind a data source), so you get
|
|
32
|
+
familiar, ready-bound node kinds — `Source`, `Transformer`, `Reducer`, `Sink` —
|
|
33
|
+
instead of writing `Source[DataNode]` everywhere, plus `from_array` (eager) and
|
|
34
|
+
`ArraySource` (lazy) for building payloads.
|
|
35
|
+
|
|
36
|
+
These node-kind names are the `vcti.flow` kinds bound to `DataNode`: in this
|
|
37
|
+
package `Source` *is* `Source[DataNode]`. It is a flow leaf node — distinct from
|
|
38
|
+
`vcti.datanode.DataSource`, the array-source ABC re-exported here as
|
|
39
|
+
`EagerDataSource` / `LazyDataSource`.
|
|
40
|
+
|
|
41
|
+
Nodes that need a *structured* (named-field) array — field-wise merge and
|
|
42
|
+
row-keyed iteration — live in the `vcti.flow.data.fields` submodule; the base
|
|
43
|
+
binding makes no assumption about array shape.
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install vcti-dataflow
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### In `pyproject.toml` dependencies
|
|
52
|
+
|
|
53
|
+
```toml
|
|
54
|
+
dependencies = [
|
|
55
|
+
"vcti-dataflow>=2.0.0",
|
|
56
|
+
]
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Quick Start
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import numpy as np
|
|
65
|
+
from vcti.flow.data import Source, Transformer, DataNode, from_array
|
|
66
|
+
|
|
67
|
+
# A source produces a DataNode
|
|
68
|
+
class Stress(Source):
|
|
69
|
+
def load(self) -> DataNode:
|
|
70
|
+
return from_array(np.array([1.0, 2.0, 3.0]), {"units": "MPa"})
|
|
71
|
+
|
|
72
|
+
# A transformer maps one DataNode to another
|
|
73
|
+
class Scale(Transformer):
|
|
74
|
+
def __init__(self, factor: float) -> None:
|
|
75
|
+
super().__init__()
|
|
76
|
+
self.factor = factor
|
|
77
|
+
|
|
78
|
+
def transform(self, record: DataNode) -> DataNode:
|
|
79
|
+
return from_array(record.load() * self.factor, record.attributes)
|
|
80
|
+
|
|
81
|
+
result = Scale(2.0).connect(Stress()).execute()
|
|
82
|
+
result.load() # array([2., 4., 6.])
|
|
83
|
+
result.attributes["units"] # "MPa"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Reach for the array only when you need it (`record.load()`); a leaf source can
|
|
87
|
+
hand back a `LazyDataSource`-backed node to defer a heavy read.
|
|
88
|
+
|
|
89
|
+
### Structured-array nodes
|
|
90
|
+
|
|
91
|
+
The `vcti.flow.data.fields` submodule adds nodes that assume a structured
|
|
92
|
+
(named-field) array — building tables, naming/selecting/computing fields,
|
|
93
|
+
merging field groups, and row-keyed iteration:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from vcti.flow.data import ArraySource # lazy leaf source (base binding)
|
|
97
|
+
from vcti.flow.data.fields import (
|
|
98
|
+
RowTableSource, NameFields, SelectFields, ComputeFields,
|
|
99
|
+
RenameFields, DropFields, CastFields, MergeFields, for_each_field,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
# Build a structured table from dict rows (lazy — rows read on load())
|
|
103
|
+
mats = RowTableSource(lambda: material_rows(reader),
|
|
104
|
+
columns={"id": "MAT_ID", "EX": "Young's Modulus"})
|
|
105
|
+
|
|
106
|
+
# Name plain columns, select/rename, compute
|
|
107
|
+
coords = NameFields(["X", "Y", "Z"]).connect(ArraySource(lambda: reader.coords()))
|
|
108
|
+
picked = SelectFields({"X": "x", "Y": "y"}).connect(coords)
|
|
109
|
+
mag = ComputeFields({"mag": lambda a: np.hypot(a["X"], a["Y"])}).connect(coords)
|
|
110
|
+
|
|
111
|
+
# Rename, drop, cast (rename & drop return views — no data copy)
|
|
112
|
+
renamed = RenameFields({"X": "x"}).connect(coords) # rename, keep the rest
|
|
113
|
+
trimmed = DropFields(["Z"]).connect(coords) # drop, keep the rest
|
|
114
|
+
narrow = CastFields({"X": "f4"}).connect(coords) # change dtypes
|
|
115
|
+
|
|
116
|
+
# Merge field groups (same row count) into one structured array
|
|
117
|
+
combined = MergeFields().connect(ids).connect(coords).execute()
|
|
118
|
+
|
|
119
|
+
# One flow per row, keyed by a field
|
|
120
|
+
for case_id, flow in for_each_field(cases, build_case_flow, key_field="ID"):
|
|
121
|
+
flow.execute()
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## API
|
|
127
|
+
|
|
128
|
+
| Symbol | Purpose |
|
|
129
|
+
|--------|---------|
|
|
130
|
+
| `Source` / `Transformer` / `Reducer` / `Sink` | `vcti.flow` node kinds bound to `DataNode` |
|
|
131
|
+
| `Observer` | `vcti.flow` observer bound to `DataNode` |
|
|
132
|
+
| `from_array(array=None, attributes=None)` | Build a `DataNode` from an in-memory array (eager); with `array=None` the node is metadata-only |
|
|
133
|
+
| `ArraySource(load_fn, *, name=None, attributes=None)` | Lazy leaf source over a callable returning an array (the lazy counterpart of `from_array`) |
|
|
134
|
+
| `DataNode` / `EagerDataSource` / `LazyDataSource` | Re-exported from `vcti-datanode` for convenience |
|
|
135
|
+
| `fields.RowTableSource` | Lazy leaf source — a structured table from dict rows |
|
|
136
|
+
| `fields.NameFields` / `fields.SelectFields` / `fields.ComputeFields` | Name plain columns, select/rename fields, append/replace computed fields |
|
|
137
|
+
| `fields.RenameFields` / `fields.DropFields` / `fields.CastFields` | Rename or drop fields (views, no copy), or change field dtypes |
|
|
138
|
+
| `fields.MergeFields` | Field-wise merge of structured arrays |
|
|
139
|
+
| `fields.for_each_field` / `fields.field_items` | Row-keyed fan-out over a structured array |
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Dependencies
|
|
144
|
+
|
|
145
|
+
- [vcti-flow](https://github.com/vcollab/vcti-python-flow) (>=2.0.0) — the generic framework
|
|
146
|
+
- [vcti-datanode](https://github.com/vcollab/vcti-python-datanode) (>=2.0.0) — the `DataNode` payload
|
|
147
|
+
- [numpy](https://numpy.org/) (>=1.24)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
vcti/flow/data/__init__.py,sha256=TBauzupjsaAMKvlZrGTLizBBp0eUaADCi9GuY2hx7MM,1026
|
|
2
|
+
vcti/flow/data/aliases.py,sha256=2Tdby1qYkSig9JkhuZe1hjSXjxXaNbGi6OG4U5_WpJQ,1228
|
|
3
|
+
vcti/flow/data/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
vcti/flow/data/record.py,sha256=b6k_sg6ZzTNuVvv_diVAyOUK6k-i1DSi_KognAgS8_o,1287
|
|
5
|
+
vcti/flow/data/sources.py,sha256=M0MTCcnWBfBsXsJ6sXbP5cBgD5SJLhLRMVbp4gl8RbI,1225
|
|
6
|
+
vcti/flow/data/fields/__init__.py,sha256=c9bdahN3I1PtsRUb_H0gX_b3u77d0c2BgYy_rq0FHjI,1277
|
|
7
|
+
vcti/flow/data/fields/iterate.py,sha256=v0p_JICHOP_15RcSl20NSz7QCfBr-7o0o_5KBqVhIRU,1691
|
|
8
|
+
vcti/flow/data/fields/merge.py,sha256=OoKEQCCuPA-gs2U5y-NTsoPzhz07aH2ugmzsVn38UpI,1845
|
|
9
|
+
vcti/flow/data/fields/sources.py,sha256=FrEb1wupcFezUhU8dEavdwi5_fp3TonY9IuSbiaycFc,3455
|
|
10
|
+
vcti/flow/data/fields/transforms.py,sha256=dK87paRSFVC7J1GdbaHuiyYbyuruS_pl3ozqrCBr_Bw,8358
|
|
11
|
+
vcti_dataflow-2.0.0.dist-info/licenses/LICENSE,sha256=gqRj-E4YRsT7mZ52W76LG6aTTFv6iEOK9QR_fV5EdrI,369
|
|
12
|
+
vcti_dataflow-2.0.0.dist-info/METADATA,sha256=wwypvajpJ1Rio6bPWzFO2DnirS-2TbMSjendrgOT-88,5705
|
|
13
|
+
vcti_dataflow-2.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
14
|
+
vcti_dataflow-2.0.0.dist-info/top_level.txt,sha256=Jl6AIAI3Xhru_BFQAhD_13VeXLmZQd9BqBNUaAKNgKs,5
|
|
15
|
+
vcti_dataflow-2.0.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
16
|
+
vcti_dataflow-2.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
Copyright (c) 2018-2026 Visual Collaboration Technologies Inc.
|
|
2
|
+
All Rights Reserved.
|
|
3
|
+
|
|
4
|
+
This software is proprietary and confidential. Unauthorized copying,
|
|
5
|
+
distribution, or use of this software, via any medium, is strictly
|
|
6
|
+
prohibited. Access is granted only to authorized VCollab developers
|
|
7
|
+
and individuals explicitly authorized by Visual Collaboration
|
|
8
|
+
Technologies Inc.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
vcti
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|