cf-datahive 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,143 @@
+ Metadata-Version: 2.4
+ Name: cf-datahive
+ Version: 0.1.0
+ Summary: Canonical result and measurement data storage APIs for Cogniflow
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Requires-Dist: pyarrow>=12
+ Provides-Extra: pandas
+ Requires-Dist: pandas>=2.0; extra == "pandas"
+ Provides-Extra: test
+ Requires-Dist: pytest>=8.0; extra == "test"
+ Requires-Dist: pandas>=2.0; extra == "test"
+
+ # cf_datahive
+
+ `cf_datahive` is the Data Hive package boundary: it provides the Python-facing APIs and tooling around the canonical data hive root (`workspace/data_hive`).
+
+ ## Boundary (Current Phase)
+
+ - Python package role (`sandcastle/cf_datahive`): read-oriented APIs, tooling, and validation for pipeline-facing workflows.
+ - Native role (`sandcastle/cf_datahive/cpp`): the write gatekeeper and the only allowed writer under `workspace/data_hive`.
+ - Step packages must stay thin wrappers and call the native gatekeeper instead of implementing their own filesystem/parquet helpers (see the sketch below).
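+
+ A minimal sketch of the thin-wrapper pattern, assuming a hypothetical `write_table` entry point on the native gatekeeper (the real `cf_datahive_cpp` API is not documented in this README):
+
+ ```python
+ # Hypothetical thin sink step: every filesystem/parquet concern is
+ # delegated to the native write gatekeeper; the step holds no path logic.
+ import cf_datahive_cpp  # native gatekeeper; the entry point name is assumed
+
+
+ def sink_measurements(pipeline_id: str, run_id: str, batch) -> None:
+     # No open(), no pyarrow.parquet writes, and no workspace/data_hive
+     # literals here -- the guardrail script flags all of those.
+     cf_datahive_cpp.write_table(pipeline_id, run_id, "measurements", batch)
+ ```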
+
+ ## Development workflow
+
+ - The current development mode is source-first, via `scripts/fresh_install.ps1`.
+ - The package can now be built and published independently without changing the read/write ownership boundary above.
+
+ ## Canonical layout
+
+ ```
+ workspace/
+   data_hive/
+     <pipeline_id>/
+       runs/
+         <run_id>/
+           manifest.json
+           tables/
+             <table_name>/
+               part-0000.parquet
+               part-0001.parquet
+           artifacts/
+             <artifact_name>
+       latest.txt
+ ```
+
+ - `latest.txt` stores the committed `run_id` and is updated atomically (the typical swap pattern is sketched below).
+ - `manifest.json` is the source of truth (SOT) for run metadata, table metadata, file hashes, and artifact hashes.
+
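+ The usual pattern behind such an atomic pointer update, as a hedged sketch (in this repository the native gatekeeper, not Python code, owns this write):
+
+ ```python
+ import os
+ from pathlib import Path
+
+
+ def commit_latest(pipeline_dir: Path, run_id: str) -> None:
+     # Stage the new pointer in a sibling temp file, then swap it into place
+     # with os.replace(), which is atomic on a single filesystem: readers see
+     # either the old run_id or the new one, never a torn file.
+     target = pipeline_dir / "latest.txt"
+     tmp = target.with_name("latest.txt.tmp")
+     tmp.write_text(run_id, encoding="utf-8")
+     os.replace(tmp, target)
+ ```
+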
+ ## Usage
+
+ ```python
+ from pathlib import Path
+
+ from cf_datahive import DataHiveClient
+
+ workspace_root = Path("workspace")
+ client = DataHiveClient(str(workspace_root))
+
+ # Take the first listed run and read one of its tables.
+ runs = client.list_runs("opcua_fifo_avg")
+ if runs:
+     latest = runs[0].run_id
+     manifest = client.load_manifest("opcua_fifo_avg", latest)
+     table = client.read_table("opcua_fifo_avg", latest, "measurements")
+     print(manifest.status, table.num_rows)
+ ```
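+
+ Continuing the example, and assuming `read_table` returns a `pyarrow.Table` (which the `num_rows` access above suggests), the `pandas` extra enables a direct DataFrame conversion:
+
+ ```python
+ # Requires the optional extra: pip install "cf-datahive[pandas]"
+ df = table.to_pandas()  # pyarrow.Table -> pandas.DataFrame
+ print(df.head())
+ ```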
+
+ ## Manifest details
+
+ Each run stores a `RunManifest` (`schema_version="1.0"`) with:
+
+ - run lifecycle fields (`status`: `staged|committed|aborted`)
+ - table entries (format `parquet`, schema fingerprint, row/file counts, optional file hashes)
+ - artifact entries (sha256, media type, size)
+ - an optional `semantic_refs` placeholder map for future ontology links
+
+ The schema fingerprint is the sha256 of the serialized Arrow schema bytes.
+
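+ A minimal sketch of how such a fingerprint can be computed, assuming the serialization in question is pyarrow's `Schema.serialize()` (the exact byte format is not pinned down above):
+
+ ```python
+ import hashlib
+
+ import pyarrow as pa
+
+ # Example schema; serialize() emits deterministic IPC bytes for a given
+ # schema, so the fingerprint is stable across processes.
+ schema = pa.schema([("ts", pa.timestamp("us")), ("value", pa.float64())])
+ fingerprint = hashlib.sha256(schema.serialize().to_pybytes()).hexdigest()
+ print(fingerprint)
+ ```
+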
+ ## Guardrails
+
+ Run the repository guardrail check:
+
+ ```
+ python tools/check_datahive_guardrails.py
+ ```
+
+ The script scans C++ sources/headers and step packages, and fails hard when they:
+
+ - use canonical `workspace/data_hive` literals outside the native gatekeeper location
+ - violate the thin-steps rule in `sandcastle/cf_basic_steps/*/src/*/cpp`
+
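+ A simplified sketch of the kind of literal scan this implies (the real logic of `tools/check_datahive_guardrails.py` is not reproduced here):
+
+ ```python
+ from pathlib import Path
+
+ GATEKEEPER = Path("sandcastle/cf_datahive/cpp")  # the only allowed writer
+
+ violations = [
+     path
+     for path in Path("sandcastle").rglob("*")
+     if path.suffix in {".h", ".hpp", ".cpp"}
+     and not path.is_relative_to(GATEKEEPER)
+     and "workspace/data_hive" in path.read_text(errors="ignore")
+ ]
+ for path in violations:
+     print(f"hard fail: {path}")
+ ```
+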
+ ## Testing
+
+ Install the test dependencies and run:
+
+ ```
+ pip install -e "sandcastle/cf_datahive[test]"
+ pytest -q sandcastle/cf_datahive/tests
+ ```
+
+ The published distribution name is `cf-datahive`:
+
+ ```bash
+ pip install cf-datahive
+ ```
+
+ ## Publishing
+
+ `cf_datahive` is published with the dedicated Windows workflow:
+
+ - Workflow: `.github/workflows/cf_datahive_windows_publish.yml`
+ - Package directory: `sandcastle/cf_datahive`
+ - PyPI tag: `cf-datahive-v<version>`
+ - TestPyPI tag: `cf-datahive-v<version>-test`
+
+ Local preflight:
+
+ ```powershell
+ powershell -ExecutionPolicy Bypass -File scripts/mimic_windows_python_publish_workflow.ps1 `
+   -WorkflowFile .github/workflows/cf_datahive_windows_publish.yml `
+   -PackageDir sandcastle/cf_datahive `
+   -PythonExe py `
+   -PythonVersion 3.13
+ ```
+
+ Queue a dry-run dispatch:
+
+ ```powershell
+ powershell -ExecutionPolicy Bypass -File scripts/queue_windows_python_publish_workflow.ps1 `
+   -WorkflowFile .github/workflows/cf_datahive_windows_publish.yml `
+   -PackageDir sandcastle/cf_datahive `
+   -PublishTarget testpypi `
+   -Ref main `
+   -RequireLocalPass `
+   -DryRun
+ ```
+
+ ## Do / Don't
+
+ - Do: use the `DataHiveClient` read APIs (`list_runs`, `load_manifest`, `read_table`, `open_artifact`) for inspection and validation; a hedged validation sketch follows this list.
+ - Do: route pipeline write ownership through `cf_datahive_cpp` in the sink path.
+ - Don't: write Parquet files or artifacts directly into the canonical data hive root from pipeline steps.
+ - Don't: bypass manifest updates.
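+
+ Continuing the usage example above, a hypothetical integrity check; the attribute paths `manifest.artifacts` and `entry.sha256`, the artifact name, and `open_artifact` returning a binary file-like object are all assumptions, not documented API:
+
+ ```python
+ import hashlib
+
+ # Assumed attribute names and artifact name, for illustration only.
+ entry = manifest.artifacts["report.html"]
+ with client.open_artifact("opcua_fifo_avg", latest, "report.html") as fh:
+     digest = hashlib.sha256(fh.read()).hexdigest()
+ assert digest == entry.sha256, "artifact bytes do not match the manifest"
+ ```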
@@ -0,0 +1,28 @@
+ [build-system]
+ requires = ["setuptools>=64", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "cf-datahive"
+ version = "0.1.0"
+ description = "Canonical result and measurement data storage APIs for Cogniflow"
+ readme = "README.md"
+ requires-python = ">=3.11"
+ dependencies = [
+     "pyarrow>=12",
+ ]
+
+ [project.optional-dependencies]
+ pandas = ["pandas>=2.0"]
+ test = [
+     "pytest>=8.0",
+     "pandas>=2.0",
+ ]
+
+ [tool.setuptools]
+ package-dir = {"" = "src"}
+ include-package-data = true
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
+ include = ["cf_datahive*"]
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,16 @@
+ """Data hive storage client package."""
+
+ from .client import DataHiveClient, RunHandle, StorageModePolicy
+ from .manifest import ArtifactEntry, RunManifest, RunSummary, TableEntry
+ from .policy import Policy
+
+ __all__ = [
+     "ArtifactEntry",
+     "DataHiveClient",
+     "Policy",
+     "RunHandle",
+     "RunManifest",
+     "RunSummary",
+     "StorageModePolicy",
+     "TableEntry",
+ ]