cf-datahive 0.1.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cf_datahive-0.1.0/PKG-INFO +143 -0
- cf_datahive-0.1.0/README.md +130 -0
- cf_datahive-0.1.0/pyproject.toml +28 -0
- cf_datahive-0.1.0/setup.cfg +4 -0
- cf_datahive-0.1.0/src/cf_datahive/__init__.py +16 -0
- cf_datahive-0.1.0/src/cf_datahive/client.py +927 -0
- cf_datahive-0.1.0/src/cf_datahive/manifest.py +197 -0
- cf_datahive-0.1.0/src/cf_datahive/policy.py +16 -0
- cf_datahive-0.1.0/src/cf_datahive.egg-info/PKG-INFO +143 -0
- cf_datahive-0.1.0/src/cf_datahive.egg-info/SOURCES.txt +12 -0
- cf_datahive-0.1.0/src/cf_datahive.egg-info/dependency_links.txt +1 -0
- cf_datahive-0.1.0/src/cf_datahive.egg-info/requires.txt +8 -0
- cf_datahive-0.1.0/src/cf_datahive.egg-info/top_level.txt +1 -0
- cf_datahive-0.1.0/tests/test_client.py +327 -0

cf_datahive-0.1.0/PKG-INFO
@@ -0,0 +1,143 @@
Metadata-Version: 2.4
Name: cf-datahive
Version: 0.1.0
Summary: Canonical result and measurement data storage APIs for Cogniflow
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: pyarrow>=12
Provides-Extra: pandas
Requires-Dist: pandas>=2.0; extra == "pandas"
Provides-Extra: test
Requires-Dist: pytest>=8.0; extra == "test"
Requires-Dist: pandas>=2.0; extra == "test"

cf_datahive-0.1.0/README.md
@@ -0,0 +1,130 @@
# cf_datahive

`cf_datahive` is the Data Hive package boundary for the Python-facing APIs and tooling around the canonical data hive root (`workspace/<data_hive>`).

## Boundary (Current Phase)

- Python package role (`sandcastle/cf_datahive`): read-oriented API, tooling, and validation for pipeline-facing workflows.
- Native role (`sandcastle/cf_datahive/cpp`): write gatekeeper and the only allowed writer under `workspace/data_hive`.
- Step packages must stay thin wrappers and call the native gatekeeper instead of implementing their own filesystem/parquet helpers.

## Development workflow

- The current development mode is source-first via `scripts/fresh_install.ps1`.
- The package can now be built and published independently without changing the read/write ownership boundary above.

## Canonical layout

```
workspace/
  data_hive/
    <pipeline_id>/
      runs/
        <run_id>/
          manifest.json
          tables/
            <table_name>/
              part-0000.parquet
              part-0001.parquet
          artifacts/
            <artifact_name>
      latest.txt
```

- `latest.txt` stores the committed `run_id` and is updated atomically (see the sketch after this list).
- `manifest.json` is the source of truth (SOT) for run metadata, table metadata, file hashes, and artifact hashes.
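
For reference, the atomic update of `latest.txt` follows the usual write-temp-then-rename pattern. This is an illustrative sketch only; the actual writer is the native gatekeeper, and the helper name `commit_latest` is hypothetical:

```python
import os
from pathlib import Path


def commit_latest(pipeline_dir: Path, run_id: str) -> None:
    """Sketch: atomically repoint latest.txt at a committed run."""
    tmp = pipeline_dir / "latest.txt.tmp"
    tmp.write_text(run_id, encoding="utf-8")
    # os.replace is atomic on both POSIX and Windows for same-volume paths.
    os.replace(tmp, pipeline_dir / "latest.txt")
```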

## Usage

```python
from pathlib import Path

from cf_datahive import DataHiveClient

workspace_root = Path("workspace")
client = DataHiveClient(str(workspace_root))

runs = client.list_runs("opcua_fifo_avg")
if runs:
    latest = runs[0].run_id
    manifest = client.load_manifest("opcua_fifo_avg", latest)
    table = client.read_table("opcua_fifo_avg", latest, "measurements")
    print(manifest.status, table.num_rows)
```
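
With the `pandas` extra installed, the returned table converts directly to a DataFrame; `to_pandas()` is standard `pyarrow.Table` API, assuming `read_table` returns a `pyarrow.Table` (the `pyarrow>=12` dependency suggests it does):

```python
df = table.to_pandas()  # requires the "pandas" extra (pandas>=2.0)
print(df.describe())
```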

## Manifest details

Each run stores a `RunManifest` (`schema_version="1.0"`) with:

- run lifecycle fields (`status`: `staged|committed|aborted`)
- table entries (`parquet` format, schema fingerprint, row/file counts, optional file hashes)
- artifact entries (sha256, media type, size)
- an optional `semantic_refs` placeholder map for future ontology links

The schema fingerprint is the sha256 of the serialized Arrow schema bytes.
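
A minimal sketch of such a fingerprint, assuming the serialization is `pyarrow.Schema.serialize()` (exactly which bytes cf_datahive hashes is an implementation detail):

```python
import hashlib

import pyarrow as pa

schema = pa.schema([("ts", pa.timestamp("us")), ("value", pa.float64())])
# Schema.serialize() returns an Arrow IPC buffer; hash its raw bytes.
fingerprint = hashlib.sha256(schema.serialize().to_pybytes()).hexdigest()
print(fingerprint)
```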

## Guardrails

Run the repository guardrail check:

```bash
python tools/check_datahive_guardrails.py
```

The script scans C++ sources/headers and step packages, and fails hard on code that:

- uses the canonical `workspace/data_hive` literal outside the native gatekeeper location
- violates the thin-steps rule in `sandcastle/cf_basic_steps/*/src/*/cpp`
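
The heart of such a check is a literal scan with a gatekeeper allowlist. The sketch below is illustrative only, not the actual tool; the allowlist path and file extensions are assumptions:

```python
from pathlib import Path

# Hypothetical allowlist: only the native gatekeeper may name the hive root.
ALLOWED = Path("sandcastle/cf_datahive/cpp")


def offending_files(repo_root: Path) -> list[Path]:
    """Sketch: find C++ sources outside the gatekeeper that hard-code the root."""
    hits = []
    for path in repo_root.rglob("*"):
        if not path.is_file() or path.suffix not in {".cpp", ".h", ".hpp"}:
            continue
        if path.relative_to(repo_root).is_relative_to(ALLOWED):
            continue
        if "workspace/data_hive" in path.read_text(encoding="utf-8", errors="ignore"):
            hits.append(path)
    return hits
```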

## Testing

Install the test dependencies and run:

```bash
pip install -e "sandcastle/cf_datahive[test]"
pytest -q sandcastle/cf_datahive/tests
```

To install the published distribution, use the hyphenated distribution name:

```bash
pip install cf-datahive
```

## Publishing

`cf_datahive` is published with the dedicated Windows workflow:

- Workflow: `.github/workflows/cf_datahive_windows_publish.yml`
- Package directory: `sandcastle/cf_datahive`
- PyPI tag: `cf-datahive-v<version>`
- TestPyPI tag: `cf-datahive-v<version>-test`

Local preflight:

```powershell
powershell -ExecutionPolicy Bypass -File scripts/mimic_windows_python_publish_workflow.ps1 `
  -WorkflowFile .github/workflows/cf_datahive_windows_publish.yml `
  -PackageDir sandcastle/cf_datahive `
  -PythonExe py `
  -PythonVersion 3.13
```

Queue a dry-run dispatch:

```powershell
powershell -ExecutionPolicy Bypass -File scripts/queue_windows_python_publish_workflow.ps1 `
  -WorkflowFile .github/workflows/cf_datahive_windows_publish.yml `
  -PackageDir sandcastle/cf_datahive `
  -PublishTarget testpypi `
  -Ref main `
  -RequireLocalPass `
  -DryRun
```

## Do / Don't

- Do: use the `DataHiveClient` read APIs (`list_runs`, `load_manifest`, `read_table`, `open_artifact`) for inspection and validation (a validation sketch follows this list).
- Do: route pipeline write ownership through `cf_datahive_cpp` in the sink path.
- Don't: write parquet files or artifacts directly into the canonical data hive root from pipeline steps.
- Don't: bypass manifest updates.
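
One way to use those read APIs for validation is to re-hash an artifact against its manifest entry. This sketch is heavily hedged: the `ArtifactEntry` field names (`name`, `sha256`), the `manifest.artifacts` collection, and `open_artifact` returning a binary file object are all assumptions about the API surface:

```python
import hashlib


def verify_artifact(client, pipeline_id: str, run_id: str, name: str) -> bool:
    """Sketch: check that an artifact's bytes match its manifest sha256."""
    manifest = client.load_manifest(pipeline_id, run_id)
    expected = next(a.sha256 for a in manifest.artifacts if a.name == name)
    digest = hashlib.sha256()
    with client.open_artifact(pipeline_id, run_id, name) as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected
```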

cf_datahive-0.1.0/pyproject.toml
@@ -0,0 +1,28 @@
[build-system]
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "cf-datahive"
version = "0.1.0"
description = "Canonical result and measurement data storage APIs for Cogniflow"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "pyarrow>=12",
]

[project.optional-dependencies]
pandas = ["pandas>=2.0"]
test = [
    "pytest>=8.0",
    "pandas>=2.0",
]

[tool.setuptools]
package-dir = {"" = "src"}
include-package-data = true

[tool.setuptools.packages.find]
where = ["src"]
include = ["cf_datahive*"]

cf_datahive-0.1.0/src/cf_datahive/__init__.py
@@ -0,0 +1,16 @@
"""Data hive storage client package."""

from .client import DataHiveClient, RunHandle, StorageModePolicy
from .manifest import ArtifactEntry, RunManifest, RunSummary, TableEntry
from .policy import Policy

__all__ = [
    "ArtifactEntry",
    "DataHiveClient",
    "Policy",
    "RunHandle",
    "StorageModePolicy",
    "RunManifest",
    "RunSummary",
    "TableEntry",
]