mesokit-schema 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Gronemeyer
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,92 @@
1
+ Metadata-Version: 2.4
2
+ Name: mesokit-schema
3
+ Version: 0.4.0
4
+ Summary: Shared data contract for the mesofield -> datakit -> databench pipeline.
5
+ Author: Jacob Gronemeyer
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Gronemeyer
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/Gronemeyer/mesokit-schema
29
+ Project-URL: Repository, https://github.com/Gronemeyer/mesokit-schema
30
+ Project-URL: Issues, https://github.com/Gronemeyer/mesokit-schema/issues
31
+ Keywords: neuroscience,schema,pipeline,bids,mesofield
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Science/Research
34
+ Classifier: Programming Language :: Python :: 3.10
35
+ Classifier: Programming Language :: Python :: 3.11
36
+ Classifier: Programming Language :: Python :: 3.12
37
+ Classifier: Programming Language :: Python :: 3.13
38
+ Classifier: Topic :: Scientific/Engineering
39
+ Requires-Python: >=3.10
40
+ Description-Content-Type: text/markdown
41
+ License-File: LICENSE
42
+ Requires-Dist: pydantic<3.0,>=2.0
43
+ Provides-Extra: test
44
+ Requires-Dist: pytest; extra == "test"
45
+ Dynamic: license-file
46
+
47
+ # mesokit-schema
48
+
49
+ Shared data contract for the **mesofield → datakit → databench** pipeline.
50
+
51
+ This package defines the typed manifests that travel alongside data between the three repos. It is the single source of truth for:
52
+
53
+ - **`AcquisitionManifest`** — what mesofield writes at the end of a session: the list of producers that ran, the files they wrote, calibration constants, time bases, and session identity. Datakit consumes this instead of globbing the filesystem.
54
+ - **`DatasetManifest`** — what datakit writes alongside the dataset table: schema version, datakit version, content hash of the data file, hash of the upstream acquisition manifest, master `TimeBasis`, source-parser versions, declared columns. Databench validates against this on load and concatenates it into provenance.
55
+ - **`AnalysisDeclaration`** — what each databench analysis declares about its inputs and parameters, so a dataset can be validated against an analysis *before* it runs.
56
+
57
+ Plus shared building blocks: `TimeBasis`, `ColumnSpec`, BIDS path helpers, and a single `SCHEMA_VERSION` constant.
58
+
59
+ ## Install
60
+
61
+ From PyPI:
62
+
63
+ ```sh
64
+ pip install mesokit-schema
65
+ ```
66
+
67
+ From GitHub (latest main):
68
+
69
+ ```sh
70
+ pip install git+https://github.com/Gronemeyer/mesokit-schema.git
71
+ ```
72
+
73
+ ### Development
74
+
75
+ For local development across the pipeline, editable-install into all three envs at once:
76
+
77
+ ```sh
78
+ ./scripts/install-dev.sh # all envs
79
+ ./scripts/install-dev.sh mesofield # one env
80
+ ```
81
+
82
+ The script editable-installs `mesokit-schema` plus the env's own package (`mesofield`, `datakit`, `databench`) into each matching conda env. Override layout via `CONDA_ENVS_ROOT`, `DEV_ROOT`, or `DATABENCH_ROOT` env vars.
83
+
84
+ Or install manually:
85
+
86
+ ```sh
87
+ pip install -e /path/to/mesokit-schema
88
+ ```
89
+
90
+ ## Versioning
91
+
92
+ The package exposes `mesokit_schema.SCHEMA_VERSION` (semver). Consumers compare against the `schema_version` field on each manifest and refuse to load anything they don't know how to read. Bump the major version on breaking changes; minor on additive fields.
@@ -0,0 +1,46 @@
1
+ # mesokit-schema
2
+
3
+ Shared data contract for the **mesofield → datakit → databench** pipeline.
4
+
5
+ This package defines the typed manifests that travel alongside data between the three repos. It is the single source of truth for:
6
+
7
+ - **`AcquisitionManifest`** — what mesofield writes at the end of a session: the list of producers that ran, the files they wrote, calibration constants, time bases, and session identity. Datakit consumes this instead of globbing the filesystem.
8
+ - **`DatasetManifest`** — what datakit writes alongside the dataset table: schema version, datakit version, content hash of the data file, hash of the upstream acquisition manifest, master `TimeBasis`, source-parser versions, declared columns. Databench validates against this on load and concatenates it into provenance.
9
+ - **`AnalysisDeclaration`** — what each databench analysis declares about its inputs and parameters, so a dataset can be validated against an analysis *before* it runs.
10
+
11
+ Plus shared building blocks: `TimeBasis`, `ColumnSpec`, BIDS path helpers, and a single `SCHEMA_VERSION` constant.
12
+
13
+ ## Install
14
+
15
+ From PyPI:
16
+
17
+ ```sh
18
+ pip install mesokit-schema
19
+ ```
20
+
21
+ From GitHub (latest main):
22
+
23
+ ```sh
24
+ pip install git+https://github.com/Gronemeyer/mesokit-schema.git
25
+ ```
26
+
27
+ ### Development
28
+
29
+ For local development across the pipeline, editable-install into all three envs at once:
30
+
31
+ ```sh
32
+ ./scripts/install-dev.sh # all envs
33
+ ./scripts/install-dev.sh mesofield # one env
34
+ ```
35
+
36
+ The script editable-installs `mesokit-schema` plus the env's own package (`mesofield`, `datakit`, `databench`) into each matching conda env. Override layout via `CONDA_ENVS_ROOT`, `DEV_ROOT`, or `DATABENCH_ROOT` env vars.
37
+
38
+ Or install manually:
39
+
40
+ ```sh
41
+ pip install -e /path/to/mesokit-schema
42
+ ```
43
+
44
+ ## Versioning
45
+
46
+ The package exposes `mesokit_schema.SCHEMA_VERSION` (semver). Consumers compare against the `schema_version` field on each manifest and refuse to load anything they don't know how to read. Bump the major version on breaking changes; minor on additive fields.
@@ -0,0 +1,42 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "mesokit-schema"
7
+ dynamic = ["version"]
8
+ description = "Shared data contract for the mesofield -> datakit -> databench pipeline."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { file = "LICENSE" }
12
+ authors = [{ name = "Jacob Gronemeyer" }]
13
+ keywords = ["neuroscience", "schema", "pipeline", "bids", "mesofield"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Science/Research",
17
+ "Programming Language :: Python :: 3.10",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Programming Language :: Python :: 3.13",
21
+ "Topic :: Scientific/Engineering",
22
+ ]
23
+
24
+ dependencies = [
25
+ # Upper-bound to protect downstream repos from the next pydantic megaversion.
26
+ # When pydantic 3 lands, lift here only after the migration has been done.
27
+ "pydantic>=2.0,<3.0",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ test = ["pytest"]
32
+
33
+ [tool.setuptools.dynamic]
34
+ version = {attr = "mesokit_schema.version.SCHEMA_VERSION"}
35
+
36
+ [tool.setuptools.packages.find]
37
+ where = ["src"]
38
+
39
+ [project.urls]
40
+ Homepage = "https://github.com/Gronemeyer/mesokit-schema"
41
+ Repository = "https://github.com/Gronemeyer/mesokit-schema"
42
+ Issues = "https://github.com/Gronemeyer/mesokit-schema/issues"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,36 @@
1
+ """Shared data contract for the mesofield -> datakit -> databench pipeline."""
2
+
3
+ from mesokit_schema.analysis import AnalysisDeclaration, ColumnSpec, ParamSpec
4
+ from mesokit_schema.dataset import DatasetManifest, SourceVersion
5
+ from mesokit_schema.manifest import (
6
+ AcquisitionManifest,
7
+ DataqueuePayloadSchema,
8
+ ProducerEntry,
9
+ SessionIdentity,
10
+ SidecarEntry,
11
+ )
12
+ from mesokit_schema.paths import bids_session_dir, bids_stream_filename
13
+ from mesokit_schema.processing import InputRef, ProcessingManifest
14
+ from mesokit_schema.time import TimeBasis
15
+ from mesokit_schema.version import SCHEMA_VERSION
16
+
17
# The package version is the schema version: both come from one constant.
__version__ = SCHEMA_VERSION

# Names re-exported at the package root as the public API.
__all__ = [
    "SCHEMA_VERSION",
    "AcquisitionManifest",
    "AnalysisDeclaration",
    "ColumnSpec",
    "DataqueuePayloadSchema",
    "DatasetManifest",
    "InputRef",
    "ParamSpec",
    "ProcessingManifest",
    "ProducerEntry",
    "SessionIdentity",
    "SidecarEntry",
    "SourceVersion",
    "TimeBasis",
    "bids_session_dir",
    "bids_stream_filename",
]
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Optional
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
+
8
class ColumnSpec(BaseModel):
    """Identify one dataset column by its (source, signal) pair.

    Analyses list these to declare their inputs, so that a dataset can be
    checked against an analysis before the analysis runs.
    """

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    source: str
    signal: str
    role: str | None = None
    unit: str | None = None
    required: bool = True
22
+
23
+
24
class ParamSpec(BaseModel):
    """Describe one tunable parameter exposed by an analysis."""

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    name: str
    type: str = Field(
        description="Python type name as a string, e.g. 'float', 'int', 'str'.",
    )
    default: Any = None
    description: str | None = None
33
+
34
+
35
class AnalysisDeclaration(BaseModel):
    """Analysis-layer counterpart of AcquisitionManifest/DatasetManifest.

    A databench Analysis subclass is expected to expose one of these, letting
    the harness confirm that a dataset provides the analysis's required
    signals before the analysis is run.
    """

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    name: str
    version: str
    description: str | None = None

    # Input columns, split by whether the analysis can run without them.
    required_signals: list[ColumnSpec] = Field(default_factory=list)
    optional_signals: list[ColumnSpec] = Field(default_factory=list)

    params: list[ParamSpec] = Field(default_factory=list)
    outputs: list[str] = Field(
        description="Names of artifacts produced (e.g. 'plots/bout_histogram.svg').",
        default_factory=list,
    )
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Any, Optional
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+
11
+ from mesokit_schema.time import TimeBasis
12
+ from mesokit_schema.version import SCHEMA_VERSION
13
+
14
+
15
class SourceVersion(BaseModel):
    """Record the parser class, and its version, behind a column group."""

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    tag: str
    version: str
    parser_class: str = Field(
        description="Fully qualified Python path, e.g. 'datakit.sources.analysis.mesomap.MesoMapSource'.",
    )
23
+
24
+
25
class DatasetManifest(BaseModel):
    """Manifest that datakit writes next to the materialized dataset file.

    Takes the place of the silent pickle: a dataset is always the pair
    `data.parquet` + `manifest.json`. Databench validates the schema version,
    checks the upstream hash, and takes the master `time_basis` and the
    column declarations from this manifest.
    """

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    schema_version: str = Field(default=SCHEMA_VERSION)
    datakit_version: str
    built_at: datetime
    upstream_acquisition_hash: str | None = Field(
        description="sha256 of the AcquisitionManifest this dataset was built from.",
        default=None,
    )
    data_file: str = Field(
        description="Relative path to the data file, e.g. 'data.parquet'.",
    )
    data_content_hash: str = Field(description="sha256 of `data_file`.")
    time_basis: TimeBasis
    source_versions: list[SourceVersion] = Field(default_factory=list)
    index_levels: list[str] = Field(
        description="MultiIndex level names on the dataset table.",
        default_factory=lambda: ["Subject", "Session", "Task"],
    )
    columns: list[tuple[str, str]] = Field(
        description="(source, signal) pairs present on the table.",
        default_factory=list,
    )
    extra: dict[str, Any] = Field(default_factory=dict)

    def to_json(self, *, indent: int = 2) -> str:
        """Return the manifest serialized as JSON, indented by `indent`."""
        return self.model_dump_json(indent=indent)

    def write(self, path: str | Path) -> Path:
        """Write the manifest as UTF-8 JSON to `path`; return it as a `Path`."""
        target = Path(path)
        target.write_text(self.to_json(), encoding="utf-8")
        return target

    @classmethod
    def read(cls, path: str | Path) -> "DatasetManifest":
        """Load and validate a manifest from the UTF-8 JSON file at `path`."""
        text = Path(path).read_text(encoding="utf-8")
        return cls.model_validate_json(text)

    def content_hash(self) -> str:
        """Return the hex sha256 of the manifest's canonical JSON form.

        Canonical means sorted keys and no whitespace between tokens, so the
        digest does not depend on field order or on pretty-printing.
        """
        payload = self.model_dump(mode="json")
        canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
        digest = hashlib.sha256(canonical.encode("utf-8"))
        return digest.hexdigest()
74
+
75
+
76
+ def hash_file(path: str | Path, *, chunk_size: int = 1 << 20) -> str:
77
+ """sha256 of a file's bytes. Use to populate `DatasetManifest.data_content_hash`."""
78
+ h = hashlib.sha256()
79
+ with open(path, "rb") as fh:
80
+ while True:
81
+ chunk = fh.read(chunk_size)
82
+ if not chunk:
83
+ break
84
+ h.update(chunk)
85
+ return h.hexdigest()
@@ -0,0 +1,167 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Any, Literal, Optional
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+
11
+ from mesokit_schema.time import TimeBasis
12
+ from mesokit_schema.version import SCHEMA_VERSION
13
+
14
+
15
class SessionIdentity(BaseModel):
    """The (subject, session, task) tuple used downstream as a primary key."""

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    subject: str
    session: str
    task: str | None = None
    experimenter: str | None = None
    protocol: str | None = None
25
+
26
+
27
class DataqueuePayloadSchema(BaseModel):
    """Typed contract for locating and decoding a producer's dataqueue rows.

    Mesofield writes a single shared `dataqueue.csv` per session. Every
    `record()` call from any producer adds one row, and each row has a
    `device_id` and a `payload` column. Parsers that need alignment
    (treadmill, wheel) currently find "their" rows by substring-searching
    `device_id` and regex-matching payload strings, so those magic strings
    sit in parser code with no formal link to what the producer pushed.

    This model is that link. A producer declares, once, the `device_id` it
    uses in the dataqueue and the shape of its payload. The parser then reads
    the declaration from `ProducerEntry.dataqueue_schema` at ingest time
    instead of carrying a regex of its own.

    Supported payload shapes:
      - `"scalar"`: one value per row in the payload column (for example the
        wheel encoder's int click count). `payload_fields` is empty.
      - `"dict"`: the payload column holds a dict-repr (treadmill's
        `{distance, speed, device_us}`). `payload_fields` maps each field
        name to a Python type literal (e.g. `"float"`, `"int"`).
      - `"tagged_string"`: the payload column holds a key=value string such
        as `"EncoderData timestamp=1234"`. `payload_fields` maps each field
        name to the regex that extracts it.
    """

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    device_id: str = Field(
        description="Value the producer pushes under the dataqueue's device_id column.",
    )
    payload_format: Literal["scalar", "dict", "tagged_string"] = Field(
        description="How to decode the payload column.",
        default="scalar",
    )
    payload_fields: dict[str, str] = Field(
        description="Per-field type or regex; semantics depend on payload_format.",
        default_factory=dict,
    )
    description: str | None = None
69
+
70
+
71
class SidecarEntry(BaseModel):
    """An auxiliary file that a producer declares beside its primary output.

    Declaring sidecars lets consumers dispatch on `role` rather than glob on
    file names. Examples include a camera's per-frame metadata JSON, a
    mesomap mask, and a regions table.
    """

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    path: str = Field(description="Sidecar path relative to the session root.")
    role: str = Field(
        description="What this sidecar is for, e.g. 'frame_metadata', 'mask', 'regions'.",
    )
    schema_version: str | None = Field(
        description="Producer-declared version of the sidecar's own format.",
        default=None,
    )
    description: str | None = None
90
+
91
+
92
class ProducerEntry(BaseModel):
    """One record for each `DataProducer` that ran during the acquisition.

    Stands in for datakit's glob-based discovery: an entry states exactly
    which file was written, what type it is, and how its timestamps are to
    be interpreted.
    """

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    # --- identity and typing -------------------------------------------------
    device_id: str
    device_type: str
    data_type: str = Field(
        description="Free-form tag matching datakit SOURCE_REGISTRY key.",
    )
    bids_type: str | None = None
    file_type: str = Field(description="e.g. 'ome.tiff', 'csv', 'json', 'mp4'.")

    # --- files on disk -------------------------------------------------------
    output_path: str = Field(description="Path relative to the session root.")
    metadata_path: str | None = Field(
        description="Primary metadata sidecar (e.g. camera frame metadata), relative to the session root.",
        default=None,
    )

    # --- timing and calibration ----------------------------------------------
    sampling_rate_hz: float | None = None
    time_basis: TimeBasis
    calibration: dict[str, Any] = Field(
        description="Device-specific constants (wheel CPR/diameter, pixel size, etc.).",
        default_factory=dict,
    )

    # --- optional extras -----------------------------------------------------
    sidecars: list[SidecarEntry] = Field(
        description="Additional declared sidecars beyond `metadata_path`.",
        default_factory=list,
    )
    dataqueue_schema: DataqueuePayloadSchema | None = Field(
        description=(
            "If the producer pushes rows to the session dataqueue, the "
            "typed contract for how a parser finds and decodes them."
        ),
        default=None,
    )
127
+
128
+
129
class AcquisitionManifest(BaseModel):
    """Top-level manifest that mesofield writes at session shutdown.

    Each complete acquisition leaves exactly one of these on disk
    (`manifest.json`, next to the data files). Datakit ingests a session by
    reading this manifest rather than by globbing.
    """

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    schema_version: str = Field(default=SCHEMA_VERSION)
    mesofield_version: str
    acquisition_complete: bool
    started_at: datetime
    ended_at: datetime | None = None
    session: SessionIdentity
    producers: list[ProducerEntry]
    notes: str | None = None
    extra: dict[str, Any] = Field(default_factory=dict)

    def to_json(self, *, indent: int = 2) -> str:
        """Return the manifest serialized as JSON, indented by `indent`."""
        return self.model_dump_json(indent=indent)

    def write(self, path: str | Path) -> Path:
        """Write the manifest as UTF-8 JSON to `path`; return it as a `Path`."""
        target = Path(path)
        target.write_text(self.to_json(), encoding="utf-8")
        return target

    @classmethod
    def read(cls, path: str | Path) -> "AcquisitionManifest":
        """Load and validate a manifest from the UTF-8 JSON file at `path`."""
        text = Path(path).read_text(encoding="utf-8")
        return cls.model_validate_json(text)

    def content_hash(self) -> str:
        """Return a stable hex sha256 of the canonical JSON form.

        Used to seed provenance downstream. Canonical means sorted keys and
        no whitespace between tokens, so the digest does not depend on field
        order or on pretty-printing.
        """
        payload = self.model_dump(mode="json")
        canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
        digest = hashlib.sha256(canonical.encode("utf-8"))
        return digest.hexdigest()
@@ -0,0 +1,41 @@
1
+ """BIDS path construction helpers, shared across producer and consumer.
2
+
3
+ Mesofield's `ExperimentConfig.make_path()` and datakit's path parsing both
4
+ encode the same convention. Putting the helpers here removes that duplication.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
+
13
+ def bids_session_dir(root: str | Path, subject: str, session: str) -> Path:
14
+ """Return `<root>/data/sub-<subject>/ses-<session>/`."""
15
+ return Path(root) / "data" / f"sub-{subject}" / f"ses-{session}"
16
+
17
+
18
def bids_stream_filename(
    *,
    subject: str,
    session: str,
    task: Optional[str],
    suffix: str,
    extension: str,
    timestamp: Optional[str] = None,
) -> str:
    """Construct the canonical BIDS-style filename used across the pipeline.

    The name is `[<timestamp>_]sub-<subject>_ses-<session>[_task-<task>]_<suffix>`
    followed by `.<extension>`. `timestamp` and `task` are left out when they
    are `None` or empty.

    Args:
        subject: Subject label, emitted as `sub-<subject>`.
        session: Session label, emitted as `ses-<session>`.
        task: Optional task label, emitted as `task-<task>`.
        suffix: Trailing stream name, e.g. `meso`.
        extension: File extension, with or without leading dots
            (`"csv"`, `".csv"`, `"ome.tiff"`). If nothing remains after
            stripping the leading dots, the bare stem is returned.
        timestamp: Optional prefix, e.g. `251114_103045`.

    Returns:
        The filename, with no directory component.

    Example: `251114_103045_sub-001_ses-1_task-runA_meso.ome.tiff`.
    """
    parts: list[str] = []
    if timestamp:
        parts.append(timestamp)
    parts.append(f"sub-{subject}")
    parts.append(f"ses-{session}")
    if task:
        parts.append(f"task-{task}")
    parts.append(suffix)
    stem = "_".join(parts)

    ext = extension.lstrip(".")
    if not ext:
        # Avoid "<stem>.": a trailing dot is an invalid or silently altered
        # filename on some platforms, and never a meaningful extension.
        return stem
    return f"{stem}.{ext}"
@@ -0,0 +1,103 @@
1
+ """Manifest contract for intermediate processing stages (DLC, mesomap, etc.).
2
+
3
+ Mesofield writes raw acquisitions and emits `AcquisitionManifest`. Datakit
4
+ consumes those. But many real pipelines have a *middle stage* — DeepLabCut
5
+ runs on a pupil video, mesomap registers a widefield tiff to an atlas, a
6
+ spike sorter chews on a recording — that produces derived files which
7
+ datakit then ingests as if they were primary outputs.
8
+
9
+ Those middle stages have historically lived outside the contract: outputs
10
+ landed in `processed/`, datakit globbed for them, and there was no record
11
+ of which model / atlas / parameters produced the file. A re-run with
12
+ different parameters silently overwrote the previous output.
13
+
14
+ `ProcessingManifest` closes that gap. Each processor emits a sidecar
15
+ (`<tool_name>.process.json`) next to its outputs describing:
16
+
17
+ - the tool that ran (name, version, invocation)
18
+ - the inputs it consumed (path + sha256 content hash)
19
+ - the parameters it ran with
20
+ - the outputs it wrote (as `ProducerEntry` instances, same shape as raw
21
+ acquisition entries)
22
+ - the upstream `AcquisitionManifest` hash, so the provenance chain extends
23
+ across the entire pipeline.
24
+
25
+ `mesofield.processing.ProcessorRunner` is the helper that wraps any
26
+ processing function and emits the sidecar automatically.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import hashlib
32
+ import json
33
+ from datetime import datetime
34
+ from pathlib import Path
35
+ from typing import Any, Optional
36
+
37
+ from pydantic import BaseModel, ConfigDict, Field
38
+
39
+ from mesokit_schema.manifest import ProducerEntry
40
+ from mesokit_schema.version import SCHEMA_VERSION
41
+
42
+
43
class InputRef(BaseModel):
    """A processing input, pinned by content hash as well as by path."""

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    path: str = Field(
        description="Absolute or session-relative path at processing time.",
    )
    content_hash: str = Field(description="sha256 of the input file's bytes.")
    description: str | None = None
51
+
52
+
53
class ProcessingManifest(BaseModel):
    """Manifest that each intermediate processing stage emits.

    It is stored in the session's `processed/` directory under the name
    `<tool_name>.process.json` (or, if the processor prefers, under a
    tool-specific subdirectory + filename).
    """

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    schema_version: str = Field(default=SCHEMA_VERSION)

    # --- what ran ------------------------------------------------------------
    tool_name: str = Field(
        description="Stable identifier, e.g. 'mesomap', 'deeplabcut'.",
    )
    tool_version: str
    tool_invocation: str = Field(
        description="Human-readable invocation: command line or function call.",
    )
    built_at: datetime

    # --- what it consumed ----------------------------------------------------
    upstream_acquisition_hash: str | None = Field(
        description="sha256 of the AcquisitionManifest this processing run was rooted at.",
        default=None,
    )
    inputs: list[InputRef] = Field(default_factory=list)
    parameters: dict[str, Any] = Field(
        description="Free-form: model snapshot, atlas version, thresholds, ...",
        default_factory=dict,
    )

    # --- what it produced ----------------------------------------------------
    outputs: list[ProducerEntry] = Field(
        description=(
            "Each derived file declared as a producer entry, same shape "
            "as the AcquisitionManifest's `producers`."
        ),
        default_factory=list,
    )
    extra: dict[str, Any] = Field(default_factory=dict)

    def to_json(self, *, indent: int = 2) -> str:
        """Return the manifest serialized as JSON, indented by `indent`."""
        return self.model_dump_json(indent=indent)

    def write(self, path: str | Path) -> Path:
        """Write the manifest as UTF-8 JSON to `path`; return it as a `Path`."""
        target = Path(path)
        target.write_text(self.to_json(), encoding="utf-8")
        return target

    @classmethod
    def read(cls, path: str | Path) -> "ProcessingManifest":
        """Load and validate a manifest from the UTF-8 JSON file at `path`."""
        text = Path(path).read_text(encoding="utf-8")
        return cls.model_validate_json(text)

    def content_hash(self) -> str:
        """Return the hex sha256 of the manifest's canonical JSON form.

        Canonical means sorted keys and no whitespace between tokens, so the
        digest does not depend on field order or on pretty-printing.
        """
        payload = self.model_dump(mode="json")
        canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
        digest = hashlib.sha256(canonical.encode("utf-8"))
        return digest.hexdigest()
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Literal, Optional
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
# Closed set of identifiers accepted by `TimeBasis.clock_source`.
ClockSource = Literal[
    "wall_unix_s", "micromanager_core", "monotonic_s",
    "hardware_ttl", "derived",
]
14
+
15
+
16
class TimeBasis(BaseModel):
    """State how a stream's timestamps map onto a real-world clock.

    One `TimeBasis` is attached to every producer in `AcquisitionManifest`,
    and one more serves as the master timeline in `DatasetManifest`. With it
    the downstream consumer can align streams deterministically rather than
    guess from column names.
    """

    # Unknown keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    clock_source: ClockSource
    # Seconds is the only unit the schema accepts.
    units: Literal["seconds"] = "seconds"
    epoch_unix_s: float | None = Field(
        description="If timestamps are relative, the absolute unix time of t=0.",
        default=None,
    )
    known_offset_s: float = Field(
        description="Static offset to add to recorded timestamps to align with master.",
        default=0.0,
    )
    description: str | None = None
@@ -0,0 +1 @@
1
# Schema version (semver). Manifests use it as their default `schema_version`,
# and pyproject.toml reads it as the distribution version.
SCHEMA_VERSION = "0.4.0"
@@ -0,0 +1,92 @@
1
+ Metadata-Version: 2.4
2
+ Name: mesokit-schema
3
+ Version: 0.4.0
4
+ Summary: Shared data contract for the mesofield -> datakit -> databench pipeline.
5
+ Author: Jacob Gronemeyer
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Gronemeyer
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/Gronemeyer/mesokit-schema
29
+ Project-URL: Repository, https://github.com/Gronemeyer/mesokit-schema
30
+ Project-URL: Issues, https://github.com/Gronemeyer/mesokit-schema/issues
31
+ Keywords: neuroscience,schema,pipeline,bids,mesofield
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Science/Research
34
+ Classifier: Programming Language :: Python :: 3.10
35
+ Classifier: Programming Language :: Python :: 3.11
36
+ Classifier: Programming Language :: Python :: 3.12
37
+ Classifier: Programming Language :: Python :: 3.13
38
+ Classifier: Topic :: Scientific/Engineering
39
+ Requires-Python: >=3.10
40
+ Description-Content-Type: text/markdown
41
+ License-File: LICENSE
42
+ Requires-Dist: pydantic<3.0,>=2.0
43
+ Provides-Extra: test
44
+ Requires-Dist: pytest; extra == "test"
45
+ Dynamic: license-file
46
+
47
+ # mesokit-schema
48
+
49
+ Shared data contract for the **mesofield → datakit → databench** pipeline.
50
+
51
+ This package defines the typed manifests that travel alongside data between the three repos. It is the single source of truth for:
52
+
53
+ - **`AcquisitionManifest`** — what mesofield writes at the end of a session: the list of producers that ran, the files they wrote, calibration constants, time bases, and session identity. Datakit consumes this instead of globbing the filesystem.
54
+ - **`DatasetManifest`** — what datakit writes alongside the dataset table: schema version, datakit version, content hash of the data file, hash of the upstream acquisition manifest, master `TimeBasis`, source-parser versions, declared columns. Databench validates against this on load and concatenates it into provenance.
55
+ - **`AnalysisDeclaration`** — what each databench analysis declares about its inputs and parameters, so a dataset can be validated against an analysis *before* it runs.
56
+
57
+ Plus shared building blocks: `TimeBasis`, `ColumnSpec`, BIDS path helpers, and a single `SCHEMA_VERSION` constant.
58
+
59
+ ## Install
60
+
61
+ From PyPI:
62
+
63
+ ```sh
64
+ pip install mesokit-schema
65
+ ```
66
+
67
+ From GitHub (latest main):
68
+
69
+ ```sh
70
+ pip install git+https://github.com/Gronemeyer/mesokit-schema.git
71
+ ```
72
+
73
+ ### Development
74
+
75
+ For local development across the pipeline, editable-install into all three envs at once:
76
+
77
+ ```sh
78
+ ./scripts/install-dev.sh # all envs
79
+ ./scripts/install-dev.sh mesofield # one env
80
+ ```
81
+
82
+ The script editable-installs `mesokit-schema` plus the env's own package (`mesofield`, `datakit`, `databench`) into each matching conda env. Override layout via `CONDA_ENVS_ROOT`, `DEV_ROOT`, or `DATABENCH_ROOT` env vars.
83
+
84
+ Or install manually:
85
+
86
+ ```sh
87
+ pip install -e /path/to/mesokit-schema
88
+ ```
89
+
90
+ ## Versioning
91
+
92
+ The package exposes `mesokit_schema.SCHEMA_VERSION` (semver). Consumers compare against the `schema_version` field on each manifest and refuse to load anything they don't know how to read. Bump the major version on breaking changes; minor on additive fields.
@@ -0,0 +1,17 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/mesokit_schema/__init__.py
5
+ src/mesokit_schema/analysis.py
6
+ src/mesokit_schema/dataset.py
7
+ src/mesokit_schema/manifest.py
8
+ src/mesokit_schema/paths.py
9
+ src/mesokit_schema/processing.py
10
+ src/mesokit_schema/time.py
11
+ src/mesokit_schema/version.py
12
+ src/mesokit_schema.egg-info/PKG-INFO
13
+ src/mesokit_schema.egg-info/SOURCES.txt
14
+ src/mesokit_schema.egg-info/dependency_links.txt
15
+ src/mesokit_schema.egg-info/requires.txt
16
+ src/mesokit_schema.egg-info/top_level.txt
17
+ tests/test_roundtrip.py
@@ -0,0 +1,4 @@
1
+ pydantic<3.0,>=2.0
2
+
3
+ [test]
4
+ pytest
@@ -0,0 +1 @@
1
+ mesokit_schema
@@ -0,0 +1,142 @@
1
+ from datetime import datetime, timezone
2
+ from pathlib import Path
3
+
4
+ from mesokit_schema import (
5
+ SCHEMA_VERSION,
6
+ AcquisitionManifest,
7
+ AnalysisDeclaration,
8
+ ColumnSpec,
9
+ DatasetManifest,
10
+ InputRef,
11
+ ProcessingManifest,
12
+ ProducerEntry,
13
+ SessionIdentity,
14
+ SourceVersion,
15
+ TimeBasis,
16
+ bids_session_dir,
17
+ bids_stream_filename,
18
+ )
19
+
20
+
21
def _sample_acquisition() -> AcquisitionManifest:
    """Build the acquisition manifest fixture shared by the round-trip tests.

    The fixture describes one completed session (subject "001", session "1",
    task "runA") with two producers: a camera stream that carries a metadata
    sidecar path and a sampling rate, and a wheel stream that carries
    calibration constants.
    """
    started = datetime(2026, 5, 14, 10, 0, tzinfo=timezone.utc)
    identity = SessionIdentity(subject="001", session="1", task="runA")

    camera = ProducerEntry(
        device_id="meso",
        device_type="MMCamera",
        data_type="meso_metadata",
        bids_type="func",
        file_type="ome.tiff",
        output_path="data/sub-001/ses-1/func/meso.ome.tiff",
        metadata_path="data/sub-001/ses-1/func/meso.ome.tiff_frame_metadata.json",
        sampling_rate_hz=30.0,
        time_basis=TimeBasis(clock_source="micromanager_core"),
    )
    # The wheel entry deliberately omits bids_type, metadata_path and
    # sampling_rate_hz, so the fixture also covers an entry with those unset.
    wheel = ProducerEntry(
        device_id="wheel",
        device_type="SerialWorker",
        data_type="wheel",
        file_type="csv",
        output_path="data/sub-001/ses-1/beh/wheel.csv",
        time_basis=TimeBasis(clock_source="wall_unix_s"),
        calibration={"cpr": 360, "diameter_mm": 152.4},
    )

    return AcquisitionManifest(
        mesofield_version="0.0.1",
        acquisition_complete=True,
        started_at=started,
        session=identity,
        producers=[camera, wheel],
    )
50
+
51
+
52
def test_acquisition_manifest_roundtrip(tmp_path: Path) -> None:
    """Check that an AcquisitionManifest survives a write/read cycle.

    The manifest read back from ``manifest.json`` must equal the one written,
    report the package's ``SCHEMA_VERSION``, and produce the same content
    hash as the original.
    """
    original = _sample_acquisition()
    target = tmp_path / "manifest.json"

    original.write(target)
    restored = AcquisitionManifest.read(target)

    assert restored == original
    assert restored.schema_version == SCHEMA_VERSION
    assert restored.content_hash() == original.content_hash()
60
+
61
+
62
def test_dataset_manifest_roundtrip(tmp_path: Path) -> None:
    """Check that a DatasetManifest survives a write/read cycle.

    The manifest is linked to the sample acquisition through its content
    hash; after reading it back from ``dataset_manifest.json`` the object
    must equal the original and still point at that acquisition hash.
    """
    acquisition = _sample_acquisition()

    master_timeline = TimeBasis(
        clock_source="derived",
        description="datakit master timeline",
    )
    parser_versions = [
        SourceVersion(
            tag="mesomap",
            version="0.3.0",
            parser_class="datakit.sources.analysis.mesomap.MesoMapSource",
        ),
    ]
    expected = DatasetManifest(
        datakit_version="2025.11.22",
        built_at=datetime(2026, 5, 14, 11, 0, tzinfo=timezone.utc),
        upstream_acquisition_hash=acquisition.content_hash(),
        data_file="data.parquet",
        # Placeholder digest: 64 zero characters.
        data_content_hash="0" * 64,
        time_basis=master_timeline,
        source_versions=parser_versions,
        columns=[("mesomap", "L_VISp"), ("wheel", "speed_mm")],
    )

    target = tmp_path / "dataset_manifest.json"
    expected.write(target)
    restored = DatasetManifest.read(target)

    assert restored == expected
    assert restored.upstream_acquisition_hash == acquisition.content_hash()
85
+
86
+
87
def test_processing_manifest_roundtrip(tmp_path: Path) -> None:
    """Check that a ProcessingManifest survives a write/read cycle.

    The manifest records one input reference, two parameters and one output
    entry. After reading it back from ``regional_means.process.json`` it must
    equal the original, report the package's ``SCHEMA_VERSION``, keep the
    upstream acquisition hash, and still hold exactly the one output.
    """
    acquisition = _sample_acquisition()

    # Placeholder digest: 64 "a" characters.
    tiff_input = InputRef(path="func/meso.ome.tiff", content_hash="a" * 64)
    regional_output = ProducerEntry(
        device_id="mock_regional_means",
        device_type="processor",
        data_type="regional_means",
        bids_type="func",
        file_type="csv",
        output_path="processed/regional_means.csv",
        time_basis=TimeBasis(clock_source="derived"),
    )
    expected = ProcessingManifest(
        tool_name="mock_regional_means",
        tool_version="0.1.0",
        tool_invocation="MockRegionalMeans(n_regions=4).run([tiff_path])",
        built_at=datetime(2026, 5, 14, 12, 0, tzinfo=timezone.utc),
        upstream_acquisition_hash=acquisition.content_hash(),
        inputs=[tiff_input],
        parameters={"n_regions": 4, "baseline_frames": 10},
        outputs=[regional_output],
    )

    target = tmp_path / "regional_means.process.json"
    expected.write(target)
    restored = ProcessingManifest.read(target)

    assert restored == expected
    assert restored.schema_version == SCHEMA_VERSION
    assert restored.upstream_acquisition_hash == acquisition.content_hash()
    assert len(restored.outputs) == 1
    assert restored.outputs[0].data_type == "regional_means"
119
+
120
+
121
def test_analysis_declaration() -> None:
    """Check that a ColumnSpec built without ``required`` reports it as True."""
    wheel_speed = ColumnSpec(source="wheel", signal="speed_mm", unit="mm/s")
    declaration = AnalysisDeclaration(
        name="locomotion_bouts",
        version="0.1.0",
        required_signals=[wheel_speed],
    )

    first_signal = declaration.required_signals[0]
    assert first_signal.required is True
128
+
129
+
130
def test_paths() -> None:
    """Check the BIDS path helpers against fixed expected values.

    ``bids_session_dir`` must place the session under
    ``<root>/data/sub-<subject>/ses-<session>``, and ``bids_stream_filename``
    must produce ``<timestamp>_sub-.._ses-.._task-.._<suffix>.<extension>``.
    """
    session_dir = bids_session_dir("/x", "001", "1")
    assert session_dir == Path("/x/data/sub-001/ses-1")

    stream_name = bids_stream_filename(
        subject="001",
        session="1",
        task="runA",
        suffix="meso",
        extension="ome.tiff",
        timestamp="251114_103045",
    )
    assert stream_name == "251114_103045_sub-001_ses-1_task-runA_meso.ome.tiff"