vcti-dataflow 2.0.0__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. vcti_dataflow-3.0.0/PKG-INFO +167 -0
  2. vcti_dataflow-3.0.0/README.md +146 -0
  3. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/pyproject.toml +6 -4
  4. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti/flow/data/__init__.py +3 -3
  5. vcti_dataflow-3.0.0/src/vcti/flow/fieldset/__init__.py +73 -0
  6. vcti_dataflow-3.0.0/src/vcti/flow/fieldset/aliases.py +26 -0
  7. vcti_dataflow-3.0.0/src/vcti/flow/fieldset/iterate.py +36 -0
  8. vcti_dataflow-3.0.0/src/vcti/flow/fieldset/py.typed +0 -0
  9. vcti_dataflow-3.0.0/src/vcti/flow/fieldset/reducers.py +116 -0
  10. vcti_dataflow-3.0.0/src/vcti/flow/fieldset/sinks.py +45 -0
  11. vcti_dataflow-3.0.0/src/vcti/flow/fieldset/sources.py +70 -0
  12. vcti_dataflow-3.0.0/src/vcti/flow/fieldset/transforms.py +192 -0
  13. vcti_dataflow-3.0.0/src/vcti_dataflow.egg-info/PKG-INFO +167 -0
  14. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti_dataflow.egg-info/SOURCES.txt +9 -7
  15. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti_dataflow.egg-info/requires.txt +1 -0
  16. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/tests/test_data.py +16 -0
  17. vcti_dataflow-3.0.0/tests/test_fieldset.py +350 -0
  18. vcti_dataflow-2.0.0/PKG-INFO +0 -147
  19. vcti_dataflow-2.0.0/README.md +0 -127
  20. vcti_dataflow-2.0.0/src/vcti/flow/data/fields/__init__.py +0 -41
  21. vcti_dataflow-2.0.0/src/vcti/flow/data/fields/iterate.py +0 -48
  22. vcti_dataflow-2.0.0/src/vcti/flow/data/fields/merge.py +0 -53
  23. vcti_dataflow-2.0.0/src/vcti/flow/data/fields/sources.py +0 -94
  24. vcti_dataflow-2.0.0/src/vcti/flow/data/fields/transforms.py +0 -203
  25. vcti_dataflow-2.0.0/src/vcti_dataflow.egg-info/PKG-INFO +0 -147
  26. vcti_dataflow-2.0.0/tests/test_fields.py +0 -95
  27. vcti_dataflow-2.0.0/tests/test_shaping.py +0 -309
  28. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/LICENSE +0 -0
  29. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/setup.cfg +0 -0
  30. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti/flow/data/aliases.py +0 -0
  31. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti/flow/data/py.typed +0 -0
  32. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti/flow/data/record.py +0 -0
  33. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti/flow/data/sources.py +0 -0
  34. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti_dataflow.egg-info/dependency_links.txt +0 -0
  35. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti_dataflow.egg-info/top_level.txt +0 -0
  36. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/src/vcti_dataflow.egg-info/zip-safe +0 -0
  37. {vcti_dataflow-2.0.0 → vcti_dataflow-3.0.0}/tests/test_version.py +0 -0
@@ -0,0 +1,167 @@
1
+ Metadata-Version: 2.4
2
+ Name: vcti-dataflow
3
+ Version: 3.0.0
4
+ Summary: vcti-flow bindings for array data: a DataNode payload binding and a FieldSet table binding (sources, transformers, reducers, sinks).
5
+ Author: Visual Collaboration Technologies Inc.
6
+ Requires-Python: <3.15,>=3.12
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: vcti-flow>=2.0.0
10
+ Requires-Dist: vcti-datanode>=2.0.0
11
+ Requires-Dist: vcti-fieldset[datanode]>=2.0.0
12
+ Requires-Dist: numpy>=1.24
13
+ Provides-Extra: test
14
+ Requires-Dist: pytest; extra == "test"
15
+ Requires-Dist: pytest-cov; extra == "test"
16
+ Provides-Extra: lint
17
+ Requires-Dist: ruff; extra == "lint"
18
+ Provides-Extra: typecheck
19
+ Requires-Dist: mypy; extra == "typecheck"
20
+ Dynamic: license-file
21
+
22
+ # Data Flow
23
+
24
+ vcti-flow bindings for array data: a **DataNode** payload binding and a
25
+ **FieldSet** table binding — sources, transformers, reducers, and sinks.
26
+
27
+ ## Overview
28
+
29
+ [vcti-flow](https://github.com/vcollab/vcti-python-flow) is a payload-agnostic
30
+ framework for composing flow graphs — it never inspects the values flowing
31
+ through it. This package binds that framework to two concrete payloads so you
32
+ subclass ready-bound node kinds instead of writing `Source[...]` everywhere:
33
+
34
+ - **`vcti.flow.data`** — the [vcti-datanode](https://github.com/vcollab/vcti-python-datanode)
35
+ `DataNode` binding: one array plus layered attributes behind a data source.
36
+ Node kinds `Source` / `Transformer` / `Reducer` / `Sink` / `Observer`, plus
37
+ `from_array` (eager) and `ArraySource` (lazy) payload builders.
38
+ - **`vcti.flow.fieldset`** — the [vcti-fieldset](https://github.com/vcollab/vcti-python-fieldset)
39
+ `FieldSet` binding: a **table** of named columns. This is where column and row
40
+ reshaping lives — select / drop / rename / cast / compute, filter / sort /
41
+ slice, and the N→1 combiners merge / concat. The nodes are thin wrappers over
42
+ FieldSet's own copy-free `project()` / `rows()` builders.
43
+
44
+ Use the DataNode binding for single-array flows; use the FieldSet binding to
45
+ reshape multi-column tables. `DataNodesSource` / `WriteDataNodes` bridge between
46
+ them (a group of DataNodes ⇆ a FieldSet).
47
+
48
+ ## Installation
49
+
50
+ ```bash
51
+ pip install vcti-dataflow
52
+ ```
53
+
54
+ ### In `pyproject.toml` dependencies
55
+
56
+ ```toml
57
+ dependencies = [
58
+ "vcti-dataflow>=3.0.0",
59
+ ]
60
+ ```
61
+
62
+ ---
63
+
64
+ ## Quick Start — DataNode binding
65
+
66
+ ```python
67
+ import numpy as np
68
+ from vcti.flow.data import Source, Transformer, DataNode, from_array
69
+
70
+ # A source produces a DataNode
71
+ class Stress(Source):
72
+ def load(self) -> DataNode:
73
+ return from_array(np.array([1.0, 2.0, 3.0]), {"units": "MPa"})
74
+
75
+ # A transformer maps one DataNode to another
76
+ class Scale(Transformer):
77
+ def __init__(self, factor: float) -> None:
78
+ super().__init__()
79
+ self.factor = factor
80
+
81
+ def transform(self, record: DataNode) -> DataNode:
82
+ return from_array(record.load() * self.factor, record.attributes)
83
+
84
+ result = Scale(2.0).connect(Stress()).execute()
85
+ result.load() # array([2., 4., 6.])
86
+ result.attributes["units"] # "MPa"
87
+ ```
88
+
89
+ Reach for the array only when you need it (`record.load()`); a leaf source can
90
+ hand back a `LazyDataSource`-backed node (or use `ArraySource`) to defer a heavy
91
+ read.
92
+
93
+ ## Quick Start — FieldSet binding (tables)
94
+
95
+ The `vcti.flow.fieldset` nodes carry a `FieldSet` (named columns) and reshape it.
96
+ Column ops (`select` / `drop` / `rename` / `cast`) share columns by reference —
97
+ no data copy; `cast` is a lazy per-column cast. Row ops (`filter` / `sort` / `head` / `slice`) fold to a single row index.
98
+
99
+ ```python
100
+ import numpy as np
101
+ from vcti.datanode import DataNode, EagerDataSource
102
+ from vcti.flow.fieldset import (
103
+ DataNodesSource, SelectFields, RenameFields, CastFields, ComputeFields,
104
+ FilterRows, SortRows, MergeFields, ConcatRows, WriteDataNodes,
105
+ )
106
+
107
+ # Read a group of DataNodes as one FieldSet (lazy — nothing materialises here)
108
+ src = DataNodesSource([
109
+ DataNode(name="id", data_source=EagerDataSource(np.array([1, 2, 3]))),
110
+ DataNode(name="stress", data_source=EagerDataSource(np.array([10.0, 20.0, 30.0]))),
111
+ ])
112
+
113
+ # Reshape: rename, add a computed column, filter and sort rows
114
+ flow = RenameFields({"stress": "s"}).connect(src)
115
+ flow = ComputeFields("s_kpa = s * 1000").connect(flow)
116
+ flow = FilterRows("s > 10").connect(flow)
117
+ flow = SortRows("s", descending=True).connect(flow)
118
+
119
+ result = flow.execute() # a FieldSet
120
+ result.get_values("s_kpa") # array([30000., 20000.])
121
+
122
+ # Combine tables (flowA / flowB are placeholders for any two FieldSet flows) — N→1 reducers own the policy (collision / schema)
123
+ wide = MergeFields().connect(flowA).connect(flowB) # union columns
124
+ tall = ConcatRows().connect(flowA).connect(flowB) # stack rows
125
+
126
+ # Write out: FieldSet -> DataNodes -> your writer (I/O stays in your app)
127
+ WriteDataNodes(lambda node: write_to_h5(node)).connect(flow).execute()
128
+ ```
129
+
130
+ Column reshaping is `project()`-backed; row reshaping is `rows()`-backed. See
131
+ [docs/patterns.md](docs/patterns.md) for the full recipe set.
132
+
133
+ ---
134
+
135
+ ## API
136
+
137
+ ### `vcti.flow.data` — DataNode binding
138
+
139
+ | Symbol | Purpose |
140
+ |--------|---------|
141
+ | `Source` / `Transformer` / `Reducer` / `Sink` / `Observer` | `vcti.flow` kinds bound to `DataNode` |
142
+ | `from_array(array, attributes=None)` | Build a `DataNode` from an in-memory array (eager) |
143
+ | `ArraySource(load_fn, attributes=None)` | Lazy leaf source over a callable returning an array |
144
+ | `DataNode` / `EagerDataSource` / `LazyDataSource` | Re-exported from `vcti-datanode` |
145
+
146
+ ### `vcti.flow.fieldset` — FieldSet binding
147
+
148
+ | Symbol | Purpose |
149
+ |--------|---------|
150
+ | `Source` / `Transformer` / `Reducer` / `Sink` / `Observer` | `vcti.flow` kinds bound to `FieldSet` |
151
+ | `SelectFields` / `DropFields` / `RenameFields` | Keep / remove / rename columns (share sources, no copy) |
152
+ | `CastFields` / `ComputeFields` / `FreezeFields` | Cast dtypes (lazy), add expression columns, bake an expression to data |
153
+ | `FilterRows` / `SortRows` / `HeadRows` / `SliceRows` | Row selection (fold to one index; slice stays a view) |
154
+ | `MergeFields` / `ConcatRows` | N→1: union columns / stack rows (own collision & schema policy) |
155
+ | `FieldSetSource` | Leaf source over an existing `FieldSet` (or a callable returning one) |
156
+ | `DataNodesSource` / `WriteDataNodes` | Boundary: DataNodes → FieldSet / FieldSet → DataNodes |
157
+ | `for_each_group(source, key_field, factory)` | Fan out into one flow per group |
158
+ | `from_datanodes` / `to_datanodes` | Re-exported from `vcti-fieldset` |
159
+
160
+ ---
161
+
162
+ ## Dependencies
163
+
164
+ - [vcti-flow](https://github.com/vcollab/vcti-python-flow) (>=2.0.0) — the generic framework
165
+ - [vcti-datanode](https://github.com/vcollab/vcti-python-datanode) (>=2.0.0) — the `DataNode` payload
166
+ - [vcti-fieldset](https://github.com/vcollab/vcti-python-fieldset) `[datanode]` (>=2.0.0) — the `FieldSet` table payload and its copy-free reshaping
167
+ - [numpy](https://numpy.org/) (>=1.24)
@@ -0,0 +1,146 @@
1
+ # Data Flow
2
+
3
+ vcti-flow bindings for array data: a **DataNode** payload binding and a
4
+ **FieldSet** table binding — sources, transformers, reducers, and sinks.
5
+
6
+ ## Overview
7
+
8
+ [vcti-flow](https://github.com/vcollab/vcti-python-flow) is a payload-agnostic
9
+ framework for composing flow graphs — it never inspects the values flowing
10
+ through it. This package binds that framework to two concrete payloads so you
11
+ subclass ready-bound node kinds instead of writing `Source[...]` everywhere:
12
+
13
+ - **`vcti.flow.data`** — the [vcti-datanode](https://github.com/vcollab/vcti-python-datanode)
14
+ `DataNode` binding: one array plus layered attributes behind a data source.
15
+ Node kinds `Source` / `Transformer` / `Reducer` / `Sink` / `Observer`, plus
16
+ `from_array` (eager) and `ArraySource` (lazy) payload builders.
17
+ - **`vcti.flow.fieldset`** — the [vcti-fieldset](https://github.com/vcollab/vcti-python-fieldset)
18
+ `FieldSet` binding: a **table** of named columns. This is where column and row
19
+ reshaping lives — select / drop / rename / cast / compute, filter / sort /
20
+ slice, and the N→1 combiners merge / concat. The nodes are thin wrappers over
21
+ FieldSet's own copy-free `project()` / `rows()` builders.
22
+
23
+ Use the DataNode binding for single-array flows; use the FieldSet binding to
24
+ reshape multi-column tables. `DataNodesSource` / `WriteDataNodes` bridge between
25
+ them (a group of DataNodes ⇆ a FieldSet).
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ pip install vcti-dataflow
31
+ ```
32
+
33
+ ### In `pyproject.toml` dependencies
34
+
35
+ ```toml
36
+ dependencies = [
37
+ "vcti-dataflow>=3.0.0",
38
+ ]
39
+ ```
40
+
41
+ ---
42
+
43
+ ## Quick Start — DataNode binding
44
+
45
+ ```python
46
+ import numpy as np
47
+ from vcti.flow.data import Source, Transformer, DataNode, from_array
48
+
49
+ # A source produces a DataNode
50
+ class Stress(Source):
51
+ def load(self) -> DataNode:
52
+ return from_array(np.array([1.0, 2.0, 3.0]), {"units": "MPa"})
53
+
54
+ # A transformer maps one DataNode to another
55
+ class Scale(Transformer):
56
+ def __init__(self, factor: float) -> None:
57
+ super().__init__()
58
+ self.factor = factor
59
+
60
+ def transform(self, record: DataNode) -> DataNode:
61
+ return from_array(record.load() * self.factor, record.attributes)
62
+
63
+ result = Scale(2.0).connect(Stress()).execute()
64
+ result.load() # array([2., 4., 6.])
65
+ result.attributes["units"] # "MPa"
66
+ ```
67
+
68
+ Reach for the array only when you need it (`record.load()`); a leaf source can
69
+ hand back a `LazyDataSource`-backed node (or use `ArraySource`) to defer a heavy
70
+ read.
71
+
72
+ ## Quick Start — FieldSet binding (tables)
73
+
74
+ The `vcti.flow.fieldset` nodes carry a `FieldSet` (named columns) and reshape it.
75
+ Column ops (`select` / `drop` / `rename` / `cast`) share columns by reference —
76
+ no data copy; `cast` is a lazy per-column cast. Row ops (`filter` / `sort` / `head` / `slice`) fold to a single row index.
77
+
78
+ ```python
79
+ import numpy as np
80
+ from vcti.datanode import DataNode, EagerDataSource
81
+ from vcti.flow.fieldset import (
82
+ DataNodesSource, SelectFields, RenameFields, CastFields, ComputeFields,
83
+ FilterRows, SortRows, MergeFields, ConcatRows, WriteDataNodes,
84
+ )
85
+
86
+ # Read a group of DataNodes as one FieldSet (lazy — nothing materialises here)
87
+ src = DataNodesSource([
88
+ DataNode(name="id", data_source=EagerDataSource(np.array([1, 2, 3]))),
89
+ DataNode(name="stress", data_source=EagerDataSource(np.array([10.0, 20.0, 30.0]))),
90
+ ])
91
+
92
+ # Reshape: rename, add a computed column, filter and sort rows
93
+ flow = RenameFields({"stress": "s"}).connect(src)
94
+ flow = ComputeFields("s_kpa = s * 1000").connect(flow)
95
+ flow = FilterRows("s > 10").connect(flow)
96
+ flow = SortRows("s", descending=True).connect(flow)
97
+
98
+ result = flow.execute() # a FieldSet
99
+ result.get_values("s_kpa") # array([30000., 20000.])
100
+
101
+ # Combine tables (flowA / flowB are placeholders for any two FieldSet flows) — N→1 reducers own the policy (collision / schema)
102
+ wide = MergeFields().connect(flowA).connect(flowB) # union columns
103
+ tall = ConcatRows().connect(flowA).connect(flowB) # stack rows
104
+
105
+ # Write out: FieldSet -> DataNodes -> your writer (I/O stays in your app)
106
+ WriteDataNodes(lambda node: write_to_h5(node)).connect(flow).execute()
107
+ ```
108
+
109
+ Column reshaping is `project()`-backed; row reshaping is `rows()`-backed. See
110
+ [docs/patterns.md](docs/patterns.md) for the full recipe set.
111
+
112
+ ---
113
+
114
+ ## API
115
+
116
+ ### `vcti.flow.data` — DataNode binding
117
+
118
+ | Symbol | Purpose |
119
+ |--------|---------|
120
+ | `Source` / `Transformer` / `Reducer` / `Sink` / `Observer` | `vcti.flow` kinds bound to `DataNode` |
121
+ | `from_array(array, attributes=None)` | Build a `DataNode` from an in-memory array (eager) |
122
+ | `ArraySource(load_fn, attributes=None)` | Lazy leaf source over a callable returning an array |
123
+ | `DataNode` / `EagerDataSource` / `LazyDataSource` | Re-exported from `vcti-datanode` |
124
+
125
+ ### `vcti.flow.fieldset` — FieldSet binding
126
+
127
+ | Symbol | Purpose |
128
+ |--------|---------|
129
+ | `Source` / `Transformer` / `Reducer` / `Sink` / `Observer` | `vcti.flow` kinds bound to `FieldSet` |
130
+ | `SelectFields` / `DropFields` / `RenameFields` | Keep / remove / rename columns (share sources, no copy) |
131
+ | `CastFields` / `ComputeFields` / `FreezeFields` | Cast dtypes (lazy), add expression columns, bake an expression to data |
132
+ | `FilterRows` / `SortRows` / `HeadRows` / `SliceRows` | Row selection (fold to one index; slice stays a view) |
133
+ | `MergeFields` / `ConcatRows` | N→1: union columns / stack rows (own collision & schema policy) |
134
+ | `FieldSetSource` | Leaf source over an existing `FieldSet` (or a callable returning one) |
135
+ | `DataNodesSource` / `WriteDataNodes` | Boundary: DataNodes → FieldSet / FieldSet → DataNodes |
136
+ | `for_each_group(source, key_field, factory)` | Fan out into one flow per group |
137
+ | `from_datanodes` / `to_datanodes` | Re-exported from `vcti-fieldset` |
138
+
139
+ ---
140
+
141
+ ## Dependencies
142
+
143
+ - [vcti-flow](https://github.com/vcollab/vcti-python-flow) (>=2.0.0) — the generic framework
144
+ - [vcti-datanode](https://github.com/vcollab/vcti-python-datanode) (>=2.0.0) — the `DataNode` payload
145
+ - [vcti-fieldset](https://github.com/vcollab/vcti-python-fieldset) `[datanode]` (>=2.0.0) — the `FieldSet` table payload and its copy-free reshaping
146
+ - [numpy](https://numpy.org/) (>=1.24)
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "vcti-dataflow"
7
- version = "2.0.0"
8
- description = "The DataNode binding of vcti-flow: sources, transformers, reducers, and combiners for vcti-datanode payloads."
7
+ version = "3.0.0"
8
+ description = "vcti-flow bindings for array data: a DataNode payload binding and a FieldSet table binding (sources, transformers, reducers, sinks)."
9
9
  readme = "README.md"
10
10
  authors = [
11
11
  {name = "Visual Collaboration Technologies Inc."}
@@ -14,16 +14,18 @@ requires-python = ">=3.12,<3.15"
14
14
  dependencies = [
15
15
  "vcti-flow>=2.0.0",
16
16
  "vcti-datanode>=2.0.0",
17
+ "vcti-fieldset[datanode]>=2.0.0",
17
18
  "numpy>=1.24",
18
19
  ]
19
20
 
20
21
  [tool.setuptools.packages.find]
21
22
  where = ["src"]
22
- include = ["vcti.flow.data", "vcti.flow.data.*"]
23
+ include = ["vcti.flow.data", "vcti.flow.data.*", "vcti.flow.fieldset", "vcti.flow.fieldset.*"]
23
24
  namespaces = true
24
25
 
25
26
  [tool.setuptools.package-data]
26
27
  "vcti.flow.data" = ["py.typed"]
28
+ "vcti.flow.fieldset" = ["py.typed"]
27
29
 
28
30
  [project.optional-dependencies]
29
31
  test = ["pytest", "pytest-cov"]
@@ -34,7 +36,7 @@ typecheck = ["mypy"]
34
36
  zip-safe = true
35
37
 
36
38
  [tool.pytest.ini_options]
37
- addopts = "--cov=vcti.flow.data --cov-report=term-missing --cov-fail-under=95"
39
+ addopts = "--cov=vcti.flow.data --cov=vcti.flow.fieldset --cov-report=term-missing --cov-fail-under=95"
38
40
 
39
41
  [tool.mypy]
40
42
  python_version = "3.12"
@@ -4,9 +4,9 @@
4
4
 
5
5
  DataNode-bound node kinds (``Source``, ``Transformer``, ``Reducer``, ``Sink``,
6
6
  ``Observer``) over the generic ``vcti.flow`` framework, plus ``from_array`` (eager)
7
- and ``ArraySource`` (lazy) for building ``DataNode`` payloads. Structured-array-
8
- specific nodes (field merge, row iteration, field shaping) live in the
9
- ``vcti.flow.data.fields`` submodule.
7
+ and ``ArraySource`` (lazy) for building ``DataNode`` payloads. This binding is for
8
+ single-array flows; table (named-column) reshaping lives in the sibling
9
+ ``vcti.flow.fieldset`` binding over a ``FieldSet`` payload.
10
10
  """
11
11
 
12
12
  from importlib.metadata import version
@@ -0,0 +1,73 @@
1
+ # Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
2
+ # See LICENSE for details.
3
+ """vcti.flow.fieldset — the FieldSet binding of vcti.flow.
4
+
5
+ FieldSet-bound node kinds (``Source``, ``Transformer``, ``Reducer``, ``Sink``,
6
+ ``Observer``) over the generic ``vcti.flow`` framework, plus flow nodes that wrap
7
+ ``vcti.fieldset.FieldSet``'s copy-free reshaping:
8
+
9
+ - Column transformers (1->1, over ``FieldSet.project()`` / ``expressions``):
10
+ ``SelectFields``, ``DropFields``, ``RenameFields``, ``CastFields``,
11
+ ``ComputeFields``, ``FreezeFields``.
12
+ - Row transformers (1->1, over ``FieldSet.rows()``): ``FilterRows``,
13
+ ``SortRows``, ``HeadRows``, ``SliceRows``.
14
+ - Reducers (N->1): ``MergeFields`` (union columns), ``ConcatRows`` (stack rows).
15
+ - Sources: ``FieldSetSource`` (an existing FieldSet) and ``DataNodesSource``
16
+ (DataNodes -> FieldSet).
17
+ - Sink / boundary: ``WriteDataNodes`` (FieldSet -> DataNodes), plus re-exported
18
+ ``from_datanodes`` / ``to_datanodes``.
19
+ - Group fan-out: ``for_each_group``.
20
+ """
21
+
22
+ from importlib.metadata import version
23
+
24
+ from vcti.fieldset import FieldSet
25
+ from vcti.fieldset.datanode import from_datanodes, to_datanodes
26
+
27
+ from .aliases import Observer, Reducer, Sink, Source, Transformer
28
+ from .iterate import for_each_group
29
+ from .reducers import ConcatRows, MergeFields
30
+ from .sinks import WriteDataNodes
31
+ from .sources import DataNodesSource, FieldSetSource
32
+ from .transforms import (
33
+ CastFields,
34
+ ComputeFields,
35
+ DropFields,
36
+ FilterRows,
37
+ FreezeFields,
38
+ HeadRows,
39
+ RenameFields,
40
+ SelectFields,
41
+ SliceRows,
42
+ SortRows,
43
+ )
44
+
45
+ __version__ = version("vcti-dataflow")
46
+
47
+ __all__ = [
48
+ "__version__",
49
+ "CastFields",
50
+ "ComputeFields",
51
+ "ConcatRows",
52
+ "DataNodesSource",
53
+ "DropFields",
54
+ "FieldSet",
55
+ "FieldSetSource",
56
+ "FilterRows",
57
+ "FreezeFields",
58
+ "HeadRows",
59
+ "MergeFields",
60
+ "Observer",
61
+ "Reducer",
62
+ "RenameFields",
63
+ "SelectFields",
64
+ "Sink",
65
+ "SliceRows",
66
+ "SortRows",
67
+ "Source",
68
+ "Transformer",
69
+ "WriteDataNodes",
70
+ "for_each_group",
71
+ "from_datanodes",
72
+ "to_datanodes",
73
+ ]
@@ -0,0 +1,26 @@
1
+ # Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
2
+ # See LICENSE for details.
3
+ """FieldSet-bound aliases for the vcti.flow node kinds.
4
+
5
+ These bind the generic ``vcti.flow`` framework to the ``vcti.fieldset.FieldSet``
6
+ payload, so authors subclass ``Source`` / ``Transformer`` / … — the
7
+ ``vcti.flow.fieldset`` spelling of ``Source[FieldSet]`` /
8
+ ``Transformer[FieldSet, FieldSet]`` — instead of repeating the type parameter.
9
+ They are plain assignments (not PEP 695 ``type`` aliases) so they remain usable
10
+ as base classes.
11
+
12
+ The names intentionally shadow the generic ``vcti.flow.core`` kinds: within this
13
+ binding ``Transformer`` *is* ``Transformer[FieldSet, FieldSet]``. This is the
14
+ FieldSet counterpart of the ``DataNode`` binding in ``vcti.flow.data``.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import vcti.flow.core as core
20
+ from vcti.fieldset import FieldSet
21
+
22
+ Source = core.Source[FieldSet]
23
+ Transformer = core.Transformer[FieldSet, FieldSet]
24
+ Reducer = core.Reducer[FieldSet, FieldSet]
25
+ Sink = core.Sink[FieldSet]
26
+ Observer = core.Observer[FieldSet]
@@ -0,0 +1,36 @@
1
+ # Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
2
+ # See LICENSE for details.
3
+ """Group fan-out over a FieldSet, as a flow combinator."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from collections.abc import Callable, Iterator
8
+ from typing import Any
9
+
10
+ from vcti.fieldset import FieldSet
11
+ from vcti.flow.core import Node
12
+
13
+
14
+ def for_each_group[U](
15
+ source: Node[FieldSet],
16
+ key_field: str,
17
+ factory: Callable[[FieldSet], Node[U]],
18
+ ) -> Iterator[tuple[Any, Node[U]]]:
19
+ """Fan a FieldSet out into one flow per group, keyed by *key_field*.
20
+
21
+ Executes *source* once, partitions its rows with ``rows().group_by(key_field)``
22
+ (each group is a row-view FieldSet sharing the base's columns), and yields
23
+ ``(key, factory(group))`` per distinct key. The FieldSet-specific analogue of
24
+ ``vcti.flow.core.for_each`` for the grouped case.
25
+
26
+ Args:
27
+ source: A node producing the FieldSet to partition.
28
+ key_field: Field (or expression) name whose distinct values define groups.
29
+ factory: Builds a downstream node from each group's FieldSet.
30
+
31
+ Yields:
32
+ ``(key_value, node)`` pairs, one per group, key order ascending.
33
+ """
34
+ base = source.execute()
35
+ for key, group in base.rows().group_by(key_field):
36
+ yield key, factory(group)
File without changes
@@ -0,0 +1,116 @@
1
+ # Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
2
+ # See LICENSE for details.
3
+ """N->1 reducers over FieldSet payloads.
4
+
5
+ These combine *multiple* FieldSets into one — the arity FieldSet itself does not
6
+ cover (its ``project`` / ``rows`` are 1->1). ``MergeFields`` unions columns
7
+ (horizontal); ``ConcatRows`` stacks rows (vertical). Combination policy —
8
+ name-collision handling, schema matching — lives here, in the pipeline, not in
9
+ FieldSet.
10
+
11
+ Only stored fields are combined; registered **expressions are not carried** (they
12
+ are per-input and may reference dropped columns). Freeze an expression
13
+ (``FreezeFields``) before combining if you need it in the output. Per-field
14
+ properties (components, user attributes) of the combined fields are carried over.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import numpy as np
20
+ from vcti.fieldset import ArrayFieldSource, FieldSet
21
+
22
+ from .aliases import Reducer
23
+
24
+
25
def _carry_field_properties(src: FieldSet, dst: FieldSet, names: set[str]) -> None:
    """Replicate onto ``dst`` the per-field properties ``src`` holds for ``names``.

    Every entry of ``src``'s property store (as returned by ``to_dict()``) is
    decoded with the store's ``key_mapper.key_to_parts``. Entries whose
    ``column`` part is in ``names`` are written to ``dst``'s store via ``set``
    (components and user attributes travel this way).

    An entry lacking a ``scope`` or ``name`` part is skipped silently rather
    than raising, so an unexpected key layout results in metadata not being
    carried instead of a hard failure.

    Args:
        src: FieldSet to read properties from.
        dst: FieldSet to write the matching properties to (mutated in place).
        names: Field names whose properties should be carried over.
    """
    source_store = src.properties.store
    key_mapper = source_store.key_mapper
    for raw_key, payload in source_store.to_dict().items():
        _prefix, parts = key_mapper.key_to_parts(raw_key)
        column = parts.get("column")
        scope = parts.get("scope")
        name = parts.get("name")
        # Guard clause: ignore other columns and incompletely-keyed entries.
        if column not in names or scope is None or name is None:
            continue
        dst.properties.store.set(scope=scope, column=column, name=name, value=payload)
42
+
43
+
44
class MergeFields(Reducer):
    """Combine several FieldSets side by side into one wider FieldSet.

    Each input's field sources are re-registered on the output as-is (shared by
    reference, no array copy). Matching row counts are not checked here; that
    is left to ``FieldSet`` itself. A field name seen more than once is
    resolved by ``on_conflict``:

    - ``"error"`` (default): raise ``ValueError`` on the repeated name.
    - ``"first"``: the earliest input to provide the name wins.
    - ``"last"``: the latest input to provide the name wins (the field is
      removed and re-added, so it takes the later position).

    Per-field properties are carried over from whichever input ended up
    supplying each field.

    Args:
        on_conflict: One of ``"error"``, ``"first"``, ``"last"``.
        name: Optional node name, forwarded to the base class.

    Raises:
        ValueError: At construction if ``on_conflict`` is not a known policy;
            at reduce time on a duplicate name under the ``"error"`` policy.
    """

    def __init__(self, *, on_conflict: str = "error", name: str | None = None) -> None:
        super().__init__(name=name)
        if on_conflict not in ("error", "first", "last"):
            raise ValueError(
                f"on_conflict must be 'error', 'first', or 'last', got {on_conflict!r}"
            )
        self._on_conflict = on_conflict

    def reduce(self, records: list[FieldSet]) -> FieldSet:
        """Union the fields of ``records`` into a new FieldSet."""
        policy = self._on_conflict
        merged = FieldSet()
        # Remembers, per output field, the input that finally supplied it.
        owner: dict[str, FieldSet] = {}
        for table in records:
            for column in table.fields.names:
                if column in merged.fields:
                    if policy == "error":
                        raise ValueError(f"MergeFields: duplicate field name {column!r}")
                    if policy == "first":
                        continue
                    # "last": drop the earlier field so the later one replaces it.
                    merged.fields.remove(column)
                merged.fields.add(column, table.fields.source(column))
                owner[column] = table

        # Bucket fields by supplying input (keyed on identity) so each input's
        # property store is walked only once, however many fields it gave.
        by_source: dict[int, tuple[FieldSet, set[str]]] = {}
        for column, table in owner.items():
            entry = by_source.get(id(table))
            if entry is None:
                entry = (table, set())
                by_source[id(table)] = entry
            entry[1].add(column)

        for table, columns in by_source.values():
            _carry_field_properties(table, merged, columns)
        return merged
84
+
85
+
86
class ConcatRows(Reducer):
    """Append the rows of several FieldSets that share one stored-field schema.

    For every stored field, the values of all inputs are joined with
    ``numpy.concatenate`` and stored in a fresh ``ArrayFieldSource``. Inputs
    must expose the same *set* of stored field names; order is irrelevant for
    the check, and the output follows the first input's field order.

    If the first input has no stored fields, the schema check still applies:
    the result is an empty FieldSet only when every input is field-less, and a
    field-less input mixed with data-bearing ones is reported as a mismatch.
    An empty ``records`` list also yields an empty FieldSet.

    Only stored fields are read here; registered expressions are not
    concatenated (freeze them first with ``FreezeFields``). Per-field
    properties are taken from the first input only.

    Raises:
        ValueError: If any input's stored-field names differ from the first's.
    """

    def reduce(self, records: list[FieldSet]) -> FieldSet:
        """Stack ``records`` vertically into a new FieldSet."""
        if not records:
            return FieldSet()

        first, *rest = records
        names = first.fields.names
        expected = set(names)
        for other in rest:
            if set(other.fields.names) != expected:
                message = f"ConcatRows: inconsistent schema {other.fields.names} != {names}"
                raise ValueError(message)

        # Every input is field-less at this point: nothing to stack.
        if not names:
            return FieldSet()

        combined = FieldSet()
        for column in names:
            pieces = [table.get_values(column) for table in records]
            combined.fields.add(column, ArrayFieldSource(np.concatenate(pieces)))
        _carry_field_properties(first, combined, expected)
        return combined
@@ -0,0 +1,45 @@
1
+ # Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
2
+ # See LICENSE for details.
3
+ """Sinks that export a FieldSet payload."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from collections.abc import Callable
8
+
9
+ from vcti.datanode import DataNode
10
+ from vcti.fieldset import FieldSet
11
+ from vcti.fieldset.datanode import to_datanodes
12
+
13
+ from .aliases import Sink
14
+
15
+
16
class WriteDataNodes(Sink):
    """Sink that converts a FieldSet to DataNodes and feeds each one to a writer.

    The conversion is delegated to ``vcti.fieldset.datanode.to_datanodes``.
    Per that helper's contract, each field (and, by default, each expression)
    becomes an eager ``DataNode``, with components exposed as
    ``node.intrinsic['components']`` and per-field user attributes as the
    node's enriched attributes.

    This class performs no I/O of its own: it only calls *writer* once per
    exported node, so persistence is entirely the caller's concern. As a
    ``Sink`` it leaves the incoming FieldSet unchanged.

    Args:
        writer: Callback invoked as ``writer(node)`` for every exported node;
            its return value is ignored.
        fields: Optional list of field/expression names to export, forwarded
            to ``to_datanodes``; ``None`` (default) exports everything.
        name: Optional node name, forwarded to the base class.
    """

    def __init__(
        self,
        writer: Callable[[DataNode], None],
        *,
        fields: list[str] | None = None,
        name: str | None = None,
    ) -> None:
        super().__init__(name=name)
        self._writer = writer
        self._fields = fields

    def save(self, record: FieldSet) -> None:
        """Export ``record`` and hand every resulting node to the writer."""
        exported = to_datanodes(record, fields=self._fields)
        for data_node in exported:
            self._writer(data_node)