PyPI - mesokit-schema - Versions diffs - 0.4.0__tar.gz - Mend

mesokit-schema 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

mesokit_schema-0.4.0/LICENSE +21 -0
mesokit_schema-0.4.0/PKG-INFO +92 -0
mesokit_schema-0.4.0/README.md +46 -0
mesokit_schema-0.4.0/pyproject.toml +42 -0
mesokit_schema-0.4.0/setup.cfg +4 -0
mesokit_schema-0.4.0/src/mesokit_schema/__init__.py +36 -0
mesokit_schema-0.4.0/src/mesokit_schema/analysis.py +53 -0
mesokit_schema-0.4.0/src/mesokit_schema/dataset.py +85 -0
mesokit_schema-0.4.0/src/mesokit_schema/manifest.py +167 -0
mesokit_schema-0.4.0/src/mesokit_schema/paths.py +41 -0
mesokit_schema-0.4.0/src/mesokit_schema/processing.py +103 -0
mesokit_schema-0.4.0/src/mesokit_schema/time.py +37 -0
mesokit_schema-0.4.0/src/mesokit_schema/version.py +1 -0
mesokit_schema-0.4.0/src/mesokit_schema.egg-info/PKG-INFO +92 -0
mesokit_schema-0.4.0/src/mesokit_schema.egg-info/SOURCES.txt +17 -0
mesokit_schema-0.4.0/src/mesokit_schema.egg-info/dependency_links.txt +1 -0
mesokit_schema-0.4.0/src/mesokit_schema.egg-info/requires.txt +4 -0
mesokit_schema-0.4.0/src/mesokit_schema.egg-info/top_level.txt +1 -0
mesokit_schema-0.4.0/tests/test_roundtrip.py +142 -0

mesokit_schema-0.4.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Gronemeyer
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

mesokit_schema-0.4.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,92 @@
+Metadata-Version: 2.4
+Name: mesokit-schema
+Version: 0.4.0
+Summary: Shared data contract for the mesofield -> datakit -> databench pipeline.
+Author: Jacob Gronemeyer
+License: MIT License
+        Copyright (c) 2026 Gronemeyer
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: Homepage, https://github.com/Gronemeyer/mesokit-schema
+Project-URL: Repository, https://github.com/Gronemeyer/mesokit-schema
+Project-URL: Issues, https://github.com/Gronemeyer/mesokit-schema/issues
+Keywords: neuroscience,schema,pipeline,bids,mesofield
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pydantic<3.0,>=2.0
+Provides-Extra: test
+Requires-Dist: pytest; extra == "test"
+Dynamic: license-file
+# mesokit-schema
+Shared data contract for the **mesofield → datakit → databench** pipeline.
+This package defines the typed manifests that travel alongside data between the three repos. It is the single source of truth for:
+- **`AcquisitionManifest`** — what mesofield writes at the end of a session: the list of producers that ran, the files they wrote, calibration constants, time bases, and session identity. Datakit consumes this instead of globbing the filesystem.
+- **`DatasetManifest`** — what datakit writes alongside the dataset table: schema version, datakit version, content hash of the data file, hash of the upstream acquisition manifest, master `TimeBasis`, source-parser versions, declared columns. Databench validates against this on load and concatenates it into provenance.
+- **`AnalysisDeclaration`** — what each databench analysis declares about its inputs and parameters, so a dataset can be validated against an analysis *before* it runs.
+Plus shared building blocks: `TimeBasis`, `ColumnSpec`, BIDS path helpers, and a single `SCHEMA_VERSION` constant.
+## Install
+From PyPI:
+```sh
+pip install mesokit-schema
+```
+From GitHub (latest main):
+```sh
+pip install git+https://github.com/Gronemeyer/mesokit-schema.git
+```
+### Development
+For local development across the pipeline, editable-install into all three envs at once:
+```sh
+./scripts/install-dev.sh                  # all envs
+./scripts/install-dev.sh mesofield        # one env
+```
+The script editable-installs `mesokit-schema` plus the env's own package (`mesofield`, `datakit`, `databench`) into each matching conda env. Override layout via `CONDA_ENVS_ROOT`, `DEV_ROOT`, or `DATABENCH_ROOT` env vars.
+Or install manually:
+```sh
+pip install -e /path/to/mesokit-schema
+```
+## Versioning
+The package exposes `mesokit_schema.SCHEMA_VERSION` (semver). Consumers compare against the `schema_version` field on each manifest and refuse to load anything they don't know how to read. Bump the major version on breaking changes; minor on additive fields.

mesokit_schema-0.4.0/README.md ADDED Viewed

@@ -0,0 +1,46 @@
+# mesokit-schema
+Shared data contract for the **mesofield → datakit → databench** pipeline.
+This package defines the typed manifests that travel alongside data between the three repos. It is the single source of truth for:
+- **`AcquisitionManifest`** — what mesofield writes at the end of a session: the list of producers that ran, the files they wrote, calibration constants, time bases, and session identity. Datakit consumes this instead of globbing the filesystem.
+- **`DatasetManifest`** — what datakit writes alongside the dataset table: schema version, datakit version, content hash of the data file, hash of the upstream acquisition manifest, master `TimeBasis`, source-parser versions, declared columns. Databench validates against this on load and concatenates it into provenance.
+- **`AnalysisDeclaration`** — what each databench analysis declares about its inputs and parameters, so a dataset can be validated against an analysis *before* it runs.
+Plus shared building blocks: `TimeBasis`, `ColumnSpec`, BIDS path helpers, and a single `SCHEMA_VERSION` constant.
+## Install
+From PyPI:
+```sh
+pip install mesokit-schema
+```
+From GitHub (latest main):
+```sh
+pip install git+https://github.com/Gronemeyer/mesokit-schema.git
+```
+### Development
+For local development across the pipeline, editable-install into all three envs at once:
+```sh
+./scripts/install-dev.sh                  # all envs
+./scripts/install-dev.sh mesofield        # one env
+```
+The script editable-installs `mesokit-schema` plus the env's own package (`mesofield`, `datakit`, `databench`) into each matching conda env. Override layout via `CONDA_ENVS_ROOT`, `DEV_ROOT`, or `DATABENCH_ROOT` env vars.
+Or install manually:
+```sh
+pip install -e /path/to/mesokit-schema
+```
+## Versioning
+The package exposes `mesokit_schema.SCHEMA_VERSION` (semver). Consumers compare against the `schema_version` field on each manifest and refuse to load anything they don't know how to read. Bump the major version on breaking changes; minor on additive fields.

mesokit_schema-0.4.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,42 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "mesokit-schema"
+dynamic = ["version"]
+description = "Shared data contract for the mesofield -> datakit -> databench pipeline."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
+authors = [{ name = "Jacob Gronemeyer" }]
+keywords = ["neuroscience", "schema", "pipeline", "bids", "mesofield"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Science/Research",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Scientific/Engineering",
+]
+dependencies = [
+    # Upper-bound to protect downstream repos from the next pydantic megaversion.
+    # When pydantic 3 lands, lift here only after the migration has been done.
+    "pydantic>=2.0,<3.0",
+]
+[project.optional-dependencies]
+test = ["pytest"]
+[tool.setuptools.dynamic]
+version = {attr = "mesokit_schema.version.SCHEMA_VERSION"}
+[tool.setuptools.packages.find]
+where = ["src"]
+[project.urls]
+Homepage = "https://github.com/Gronemeyer/mesokit-schema"
+Repository = "https://github.com/Gronemeyer/mesokit-schema"
+Issues = "https://github.com/Gronemeyer/mesokit-schema/issues"

mesokit_schema-0.4.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

mesokit_schema-0.4.0/src/mesokit_schema/__init__.py ADDED Viewed

@@ -0,0 +1,36 @@
+"""Shared data contract for the mesofield -> datakit -> databench pipeline."""
+from mesokit_schema.analysis import AnalysisDeclaration, ColumnSpec, ParamSpec
+from mesokit_schema.dataset import DatasetManifest, SourceVersion
+from mesokit_schema.manifest import (
+    AcquisitionManifest,
+    DataqueuePayloadSchema,
+    ProducerEntry,
+    SessionIdentity,
+    SidecarEntry,
+)
+from mesokit_schema.paths import bids_session_dir, bids_stream_filename
+from mesokit_schema.processing import InputRef, ProcessingManifest
+from mesokit_schema.time import TimeBasis
+from mesokit_schema.version import SCHEMA_VERSION
+__version__ = SCHEMA_VERSION
+__all__ = [
+    "SCHEMA_VERSION",
+    "AcquisitionManifest",
+    "AnalysisDeclaration",
+    "ColumnSpec",
+    "DataqueuePayloadSchema",
+    "DatasetManifest",
+    "InputRef",
+    "ParamSpec",
+    "ProcessingManifest",
+    "ProducerEntry",
+    "SessionIdentity",
+    "SidecarEntry",
+    "SourceVersion",
+    "TimeBasis",
+    "bids_session_dir",
+    "bids_stream_filename",
+]

mesokit_schema-0.4.0/src/mesokit_schema/analysis.py ADDED Viewed

@@ -0,0 +1,53 @@
+from __future__ import annotations
+from typing import Any, Optional
+from pydantic import BaseModel, ConfigDict, Field
+class ColumnSpec(BaseModel):
+    """A reference to a (source, signal) column on a dataset.
+    Used by analyses to declare their inputs so a dataset can be validated
+    against an analysis before it runs.
+    """
+    model_config = ConfigDict(extra="forbid")
+    source: str
+    signal: str
+    role: Optional[str] = None
+    unit: Optional[str] = None
+    required: bool = True
+class ParamSpec(BaseModel):
+    """Declares a tunable parameter on an analysis."""
+    model_config = ConfigDict(extra="forbid")
+    name: str
+    type: str = Field(description="Python type name as a string, e.g. 'float', 'int', 'str'.")
+    default: Any = None
+    description: Optional[str] = None
+class AnalysisDeclaration(BaseModel):
+    """Sibling to AcquisitionManifest/DatasetManifest for the analysis layer.
+    A databench Analysis subclass should expose one of these so the harness can
+    check that a dataset satisfies the analysis's required signals before running.
+    """
+    model_config = ConfigDict(extra="forbid")
+    name: str
+    version: str
+    description: Optional[str] = None
+    required_signals: list[ColumnSpec] = Field(default_factory=list)
+    optional_signals: list[ColumnSpec] = Field(default_factory=list)
+    params: list[ParamSpec] = Field(default_factory=list)
+    outputs: list[str] = Field(
+        default_factory=list,
+        description="Names of artifacts produced (e.g. 'plots/bout_histogram.svg').",
+    )

mesokit_schema-0.4.0/src/mesokit_schema/dataset.py ADDED Viewed

@@ -0,0 +1,85 @@
+from __future__ import annotations
+import hashlib
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional
+from pydantic import BaseModel, ConfigDict, Field
+from mesokit_schema.time import TimeBasis
+from mesokit_schema.version import SCHEMA_VERSION
+class SourceVersion(BaseModel):
+    """Records which parser class (and its version) produced a column group."""
+    model_config = ConfigDict(extra="forbid")
+    tag: str
+    version: str
+    parser_class: str = Field(description="Fully qualified Python path, e.g. 'datakit.sources.analysis.mesomap.MesoMapSource'.")
+class DatasetManifest(BaseModel):
+    """Manifest written by datakit alongside the materialized dataset file.
+    Replaces the silent pickle: every dataset is `data.parquet` + `manifest.json`.
+    Databench validates the schema version, checks the upstream hash, and reads
+    the master `time_basis` and column declarations from here.
+    """
+    model_config = ConfigDict(extra="forbid")
+    schema_version: str = Field(default=SCHEMA_VERSION)
+    datakit_version: str
+    built_at: datetime
+    upstream_acquisition_hash: Optional[str] = Field(
+        default=None,
+        description="sha256 of the AcquisitionManifest this dataset was built from.",
+    )
+    data_file: str = Field(description="Relative path to the data file, e.g. 'data.parquet'.")
+    data_content_hash: str = Field(description="sha256 of `data_file`.")
+    time_basis: TimeBasis
+    source_versions: list[SourceVersion] = Field(default_factory=list)
+    index_levels: list[str] = Field(
+        default_factory=lambda: ["Subject", "Session", "Task"],
+        description="MultiIndex level names on the dataset table.",
+    )
+    columns: list[tuple[str, str]] = Field(
+        default_factory=list,
+        description="(source, signal) pairs present on the table.",
+    )
+    extra: dict[str, Any] = Field(default_factory=dict)
+    def to_json(self, *, indent: int = 2) -> str:
+        return self.model_dump_json(indent=indent)
+    def write(self, path: str | Path) -> Path:
+        path = Path(path)
+        path.write_text(self.to_json(), encoding="utf-8")
+        return path
+    @classmethod
+    def read(cls, path: str | Path) -> "DatasetManifest":
+        raw = Path(path).read_text(encoding="utf-8")
+        return cls.model_validate_json(raw)
+    def content_hash(self) -> str:
+        canonical = json.dumps(
+            self.model_dump(mode="json"), sort_keys=True, separators=(",", ":")
+        )
+        return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
+def hash_file(path: str | Path, *, chunk_size: int = 1 << 20) -> str:
+    """sha256 of a file's bytes. Use to populate `DatasetManifest.data_content_hash`."""
+    h = hashlib.sha256()
+    with open(path, "rb") as fh:
+        while True:
+            chunk = fh.read(chunk_size)
+            if not chunk:
+                break
+            h.update(chunk)
+    return h.hexdigest()

mesokit_schema-0.4.0/src/mesokit_schema/manifest.py ADDED Viewed

@@ -0,0 +1,167 @@
+from __future__ import annotations
+import hashlib
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Literal, Optional
+from pydantic import BaseModel, ConfigDict, Field
+from mesokit_schema.time import TimeBasis
+from mesokit_schema.version import SCHEMA_VERSION
+class SessionIdentity(BaseModel):
+    """The (subject, session, task) tuple that downstream uses as a primary key."""
+    model_config = ConfigDict(extra="forbid")
+    subject: str
+    session: str
+    task: Optional[str] = None
+    experimenter: Optional[str] = None
+    protocol: Optional[str] = None
+class DataqueuePayloadSchema(BaseModel):
+    """Typed contract for finding & decoding a producer's dataqueue rows.
+    Mesofield writes one shared `dataqueue.csv` per session with one row
+    per `record()` call across all producers; each row carries a
+    `device_id` and a `payload` column. The alignment-needing parsers
+    (treadmill, wheel) today regex-match against payload strings and
+    substring-search `device_id` to locate "their" producer's rows.
+    Those magic strings live in parser code with no formal link back
+    to what the producer actually pushed.
+    `DataqueuePayloadSchema` is the typed link. The producer declares
+    its `device_id` (as it appears in the dataqueue) and its payload
+    shape once; the parser reads this off `ProducerEntry.dataqueue_schema`
+    at ingest time instead of carrying its own regex.
+    Three payload shapes are supported:
+      - `"scalar"`: payload column holds one value per row (the wheel
+        encoder's int click count, for example). `payload_fields`
+        is empty.
+      - `"dict"`: payload column holds a dict-repr (treadmill's
+        `{distance, speed, device_us}`). `payload_fields` maps each
+        field name to a Python type literal (e.g. `"float"`, `"int"`).
+      - `"tagged_string"`: payload column holds a key=value string like
+        `"EncoderData timestamp=1234"`. `payload_fields` maps each
+        field name to the regex extracting it.
+    """
+    model_config = ConfigDict(extra="forbid")
+    device_id: str = Field(
+        description="Value the producer pushes under the dataqueue's device_id column.",
+    )
+    payload_format: Literal["scalar", "dict", "tagged_string"] = Field(
+        default="scalar",
+        description="How to decode the payload column.",
+    )
+    payload_fields: dict[str, str] = Field(
+        default_factory=dict,
+        description="Per-field type or regex; semantics depend on payload_format.",
+    )
+    description: Optional[str] = None
+class SidecarEntry(BaseModel):
+    """An auxiliary file declared by a producer next to its primary output.
+    Producers expose sidecars so consumers can dispatch on `role` instead of
+    file-name globbing. Examples: a camera's per-frame metadata JSON, a
+    mesomap mask, a regions table.
+    """
+    model_config = ConfigDict(extra="forbid")
+    path: str = Field(description="Sidecar path relative to the session root.")
+    role: str = Field(
+        description="What this sidecar is for, e.g. 'frame_metadata', 'mask', 'regions'."
+    )
+    schema_version: Optional[str] = Field(
+        default=None,
+        description="Producer-declared version of the sidecar's own format.",
+    )
+    description: Optional[str] = None
+class ProducerEntry(BaseModel):
+    """One entry per `DataProducer` that ran during the acquisition.
+    Replaces datakit's glob-based discovery: each entry declares exactly
+    which file was written, what type it is, and how to interpret its
+    timestamps.
+    """
+    model_config = ConfigDict(extra="forbid")
+    device_id: str
+    device_type: str
+    data_type: str = Field(description="Free-form tag matching datakit SOURCE_REGISTRY key.")
+    bids_type: Optional[str] = None
+    file_type: str = Field(description="e.g. 'ome.tiff', 'csv', 'json', 'mp4'.")
+    output_path: str = Field(description="Path relative to the session root.")
+    metadata_path: Optional[str] = Field(
+        default=None,
+        description="Primary metadata sidecar (e.g. camera frame metadata), relative to the session root.",
+    )
+    sampling_rate_hz: Optional[float] = None
+    time_basis: TimeBasis
+    calibration: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Device-specific constants (wheel CPR/diameter, pixel size, etc.).",
+    )
+    sidecars: list[SidecarEntry] = Field(
+        default_factory=list,
+        description="Additional declared sidecars beyond `metadata_path`.",
+    )
+    dataqueue_schema: Optional[DataqueuePayloadSchema] = Field(
+        default=None,
+        description="If the producer pushes rows to the session dataqueue, the "
+                    "typed contract for how a parser finds and decodes them.",
+    )
+class AcquisitionManifest(BaseModel):
+    """Top-level manifest written by mesofield at session shutdown.
+    A complete acquisition produces exactly one of these on disk
+    (`manifest.json`, alongside the data files). Datakit ingests by reading
+    this manifest instead of globbing.
+    """
+    model_config = ConfigDict(extra="forbid")
+    schema_version: str = Field(default=SCHEMA_VERSION)
+    mesofield_version: str
+    acquisition_complete: bool
+    started_at: datetime
+    ended_at: Optional[datetime] = None
+    session: SessionIdentity
+    producers: list[ProducerEntry]
+    notes: Optional[str] = None
+    extra: dict[str, Any] = Field(default_factory=dict)
+    def to_json(self, *, indent: int = 2) -> str:
+        return self.model_dump_json(indent=indent)
+    def write(self, path: str | Path) -> Path:
+        path = Path(path)
+        path.write_text(self.to_json(), encoding="utf-8")
+        return path
+    @classmethod
+    def read(cls, path: str | Path) -> "AcquisitionManifest":
+        raw = Path(path).read_text(encoding="utf-8")
+        return cls.model_validate_json(raw)
+    def content_hash(self) -> str:
+        """Stable sha256 of the canonical JSON form. Used to seed provenance downstream."""
+        canonical = json.dumps(
+            self.model_dump(mode="json"), sort_keys=True, separators=(",", ":")
+        )
+        return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

mesokit_schema-0.4.0/src/mesokit_schema/paths.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""BIDS path construction helpers, shared across producer and consumer.
+Mesofield's `ExperimentConfig.make_path()` and datakit's path parsing both
+encode the same convention. Putting the helpers here removes that duplication.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Optional
+def bids_session_dir(root: str | Path, subject: str, session: str) -> Path:
+    """Return `<root>/data/sub-<subject>/ses-<session>/`."""
+    return Path(root) / "data" / f"sub-{subject}" / f"ses-{session}"
+def bids_stream_filename(
+    *,
+    subject: str,
+    session: str,
+    task: Optional[str],
+    suffix: str,
+    extension: str,
+    timestamp: Optional[str] = None,
+) -> str:
+    """Construct the canonical BIDS-style filename used across the pipeline.
+    Example: `251114_103045_sub-001_ses-1_task-runA_meso.ome.tiff`.
+    """
+    parts: list[str] = []
+    if timestamp:
+        parts.append(timestamp)
+    parts.append(f"sub-{subject}")
+    parts.append(f"ses-{session}")
+    if task:
+        parts.append(f"task-{task}")
+    parts.append(suffix)
+    stem = "_".join(parts)
+    ext = extension.lstrip(".")
+    return f"{stem}.{ext}"

mesokit_schema-0.4.0/src/mesokit_schema/processing.py ADDED Viewed

@@ -0,0 +1,103 @@
+"""Manifest contract for intermediate processing stages (DLC, mesomap, etc.).
+Mesofield writes raw acquisitions and emits `AcquisitionManifest`. Datakit
+consumes those. But many real pipelines have a *middle stage* — DeepLabCut
+runs on a pupil video, mesomap registers a widefield tiff to an atlas, a
+spike sorter chews on a recording — that produces derived files which
+datakit then ingests as if they were primary outputs.
+Those middle stages have historically lived outside the contract: outputs
+landed in `processed/`, datakit globbed for them, and there was no record
+of which model / atlas / parameters produced the file. A re-run with
+different parameters silently overwrote the previous output.
+`ProcessingManifest` closes that gap. Each processor emits a sidecar
+(`<tool_name>.process.json`) next to its outputs describing:
+- the tool that ran (name, version, invocation)
+- the inputs it consumed (path + sha256 content hash)
+- the parameters it ran with
+- the outputs it wrote (as `ProducerEntry` instances, same shape as raw
+  acquisition entries)
+- the upstream `AcquisitionManifest` hash, so the provenance chain extends
+  across the entire pipeline.
+`mesofield.processing.ProcessorRunner` is the helper that wraps any
+processing function and emits the sidecar automatically.
+"""
+from __future__ import annotations
+import hashlib
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional
+from pydantic import BaseModel, ConfigDict, Field
+from mesokit_schema.manifest import ProducerEntry
+from mesokit_schema.version import SCHEMA_VERSION
+class InputRef(BaseModel):
+    """A processing input recorded by content hash, not just path."""
+    model_config = ConfigDict(extra="forbid")
+    path: str = Field(description="Absolute or session-relative path at processing time.")
+    content_hash: str = Field(description="sha256 of the input file's bytes.")
+    description: Optional[str] = None
+class ProcessingManifest(BaseModel):
+    """Manifest emitted by each intermediate processing stage.
+    Lives in the session's `processed/` directory as
+    `<tool_name>.process.json` (or a tool-specific subdirectory + filename
+    if the processor prefers).
+    """
+    model_config = ConfigDict(extra="forbid")
+    schema_version: str = Field(default=SCHEMA_VERSION)
+    tool_name: str = Field(description="Stable identifier, e.g. 'mesomap', 'deeplabcut'.")
+    tool_version: str
+    tool_invocation: str = Field(
+        description="Human-readable invocation: command line or function call."
+    )
+    built_at: datetime
+    upstream_acquisition_hash: Optional[str] = Field(
+        default=None,
+        description="sha256 of the AcquisitionManifest this processing run was rooted at.",
+    )
+    inputs: list[InputRef] = Field(default_factory=list)
+    parameters: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Free-form: model snapshot, atlas version, thresholds, ...",
+    )
+    outputs: list[ProducerEntry] = Field(
+        default_factory=list,
+        description="Each derived file declared as a producer entry, same shape "
+                    "as the AcquisitionManifest's `producers`.",
+    )
+    extra: dict[str, Any] = Field(default_factory=dict)
+    def to_json(self, *, indent: int = 2) -> str:
+        return self.model_dump_json(indent=indent)
+    def write(self, path: str | Path) -> Path:
+        path = Path(path)
+        path.write_text(self.to_json(), encoding="utf-8")
+        return path
+    @classmethod
+    def read(cls, path: str | Path) -> "ProcessingManifest":
+        raw = Path(path).read_text(encoding="utf-8")
+        return cls.model_validate_json(raw)
+    def content_hash(self) -> str:
+        canonical = json.dumps(
+            self.model_dump(mode="json"), sort_keys=True, separators=(",", ":")
+        )
+        return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

mesokit_schema-0.4.0/src/mesokit_schema/time.py ADDED Viewed

@@ -0,0 +1,37 @@
+from __future__ import annotations
+from typing import Literal, Optional
+from pydantic import BaseModel, ConfigDict, Field
+ClockSource = Literal[
+    "wall_unix_s",
+    "micromanager_core",
+    "monotonic_s",
+    "hardware_ttl",
+    "derived",
+]
+class TimeBasis(BaseModel):
+    """Declares how a stream's timestamps relate to a real-world clock.
+    A `TimeBasis` is attached per-producer in `AcquisitionManifest` and once
+    more as the master timeline in `DatasetManifest`. The downstream consumer
+    uses it to align streams deterministically instead of guessing from
+    column names.
+    """
+    model_config = ConfigDict(extra="forbid")
+    clock_source: ClockSource
+    units: Literal["seconds"] = "seconds"
+    epoch_unix_s: Optional[float] = Field(
+        default=None,
+        description="If timestamps are relative, the absolute unix time of t=0.",
+    )
+    known_offset_s: float = Field(
+        default=0.0,
+        description="Static offset to add to recorded timestamps to align with master.",
+    )
+    description: Optional[str] = None

mesokit_schema-0.4.0/src/mesokit_schema/version.py ADDED Viewed

@@ -0,0 +1 @@
+SCHEMA_VERSION = "0.4.0"

mesokit_schema-0.4.0/src/mesokit_schema.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,92 @@
+Metadata-Version: 2.4
+Name: mesokit-schema
+Version: 0.4.0
+Summary: Shared data contract for the mesofield -> datakit -> databench pipeline.
+Author: Jacob Gronemeyer
+License: MIT License
+        Copyright (c) 2026 Gronemeyer
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: Homepage, https://github.com/Gronemeyer/mesokit-schema
+Project-URL: Repository, https://github.com/Gronemeyer/mesokit-schema
+Project-URL: Issues, https://github.com/Gronemeyer/mesokit-schema/issues
+Keywords: neuroscience,schema,pipeline,bids,mesofield
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pydantic<3.0,>=2.0
+Provides-Extra: test
+Requires-Dist: pytest; extra == "test"
+Dynamic: license-file
+# mesokit-schema
+Shared data contract for the **mesofield → datakit → databench** pipeline.
+This package defines the typed manifests that travel alongside data between the three repos. It is the single source of truth for:
+- **`AcquisitionManifest`** — what mesofield writes at the end of a session: the list of producers that ran, the files they wrote, calibration constants, time bases, and session identity. Datakit consumes this instead of globbing the filesystem.
+- **`DatasetManifest`** — what datakit writes alongside the dataset table: schema version, datakit version, content hash of the data file, hash of the upstream acquisition manifest, master `TimeBasis`, source-parser versions, declared columns. Databench validates against this on load and concatenates it into provenance.
+- **`AnalysisDeclaration`** — what each databench analysis declares about its inputs and parameters, so a dataset can be validated against an analysis *before* it runs.
+Plus shared building blocks: `TimeBasis`, `ColumnSpec`, BIDS path helpers, and a single `SCHEMA_VERSION` constant.
+## Install
+From PyPI:
+```sh
+pip install mesokit-schema
+```
+From GitHub (latest main):
+```sh
+pip install git+https://github.com/Gronemeyer/mesokit-schema.git
+```
+### Development
+For local development across the pipeline, editable-install into all three envs at once:
+```sh
+./scripts/install-dev.sh                  # all envs
+./scripts/install-dev.sh mesofield        # one env
+```
+The script editable-installs `mesokit-schema` plus the env's own package (`mesofield`, `datakit`, `databench`) into each matching conda env. Override layout via `CONDA_ENVS_ROOT`, `DEV_ROOT`, or `DATABENCH_ROOT` env vars.
+Or install manually:
+```sh
+pip install -e /path/to/mesokit-schema
+```
+## Versioning
+The package exposes `mesokit_schema.SCHEMA_VERSION` (semver). Consumers compare against the `schema_version` field on each manifest and refuse to load anything they don't know how to read. Bump the major version on breaking changes; minor on additive fields.

mesokit_schema-0.4.0/src/mesokit_schema.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,17 @@
+LICENSE
+README.md
+pyproject.toml
+src/mesokit_schema/__init__.py
+src/mesokit_schema/analysis.py
+src/mesokit_schema/dataset.py
+src/mesokit_schema/manifest.py
+src/mesokit_schema/paths.py
+src/mesokit_schema/processing.py
+src/mesokit_schema/time.py
+src/mesokit_schema/version.py
+src/mesokit_schema.egg-info/PKG-INFO
+src/mesokit_schema.egg-info/SOURCES.txt
+src/mesokit_schema.egg-info/dependency_links.txt
+src/mesokit_schema.egg-info/requires.txt
+src/mesokit_schema.egg-info/top_level.txt
+tests/test_roundtrip.py

mesokit_schema-0.4.0/src/mesokit_schema.egg-info/dependency_links.txt ADDED Viewed

@@ -0,0 +1 @@
+

mesokit_schema-0.4.0/src/mesokit_schema.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,4 @@
+pydantic<3.0,>=2.0
+[test]
+pytest

mesokit_schema-0.4.0/src/mesokit_schema.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+mesokit_schema

mesokit_schema-0.4.0/tests/test_roundtrip.py ADDED Viewed

@@ -0,0 +1,142 @@
+from datetime import datetime, timezone
+from pathlib import Path
+from mesokit_schema import (
+    SCHEMA_VERSION,
+    AcquisitionManifest,
+    AnalysisDeclaration,
+    ColumnSpec,
+    DatasetManifest,
+    InputRef,
+    ProcessingManifest,
+    ProducerEntry,
+    SessionIdentity,
+    SourceVersion,
+    TimeBasis,
+    bids_session_dir,
+    bids_stream_filename,
+)
+def _sample_acquisition() -> AcquisitionManifest:
+    return AcquisitionManifest(
+        mesofield_version="0.0.1",
+        acquisition_complete=True,
+        started_at=datetime(2026, 5, 14, 10, 0, tzinfo=timezone.utc),
+        session=SessionIdentity(subject="001", session="1", task="runA"),
+        producers=[
+            ProducerEntry(
+                device_id="meso",
+                device_type="MMCamera",
+                data_type="meso_metadata",
+                bids_type="func",
+                file_type="ome.tiff",
+                output_path="data/sub-001/ses-1/func/meso.ome.tiff",
+                metadata_path="data/sub-001/ses-1/func/meso.ome.tiff_frame_metadata.json",
+                sampling_rate_hz=30.0,
+                time_basis=TimeBasis(clock_source="micromanager_core"),
+            ),
+            ProducerEntry(
+                device_id="wheel",
+                device_type="SerialWorker",
+                data_type="wheel",
+                file_type="csv",
+                output_path="data/sub-001/ses-1/beh/wheel.csv",
+                time_basis=TimeBasis(clock_source="wall_unix_s"),
+                calibration={"cpr": 360, "diameter_mm": 152.4},
+            ),
+        ],
+    )
+def test_acquisition_manifest_roundtrip(tmp_path: Path) -> None:
+    manifest = _sample_acquisition()
+    out = tmp_path / "manifest.json"
+    manifest.write(out)
+    loaded = AcquisitionManifest.read(out)
+    assert loaded == manifest
+    assert loaded.schema_version == SCHEMA_VERSION
+    assert loaded.content_hash() == manifest.content_hash()
+def test_dataset_manifest_roundtrip(tmp_path: Path) -> None:
+    upstream = _sample_acquisition()
+    ds = DatasetManifest(
+        datakit_version="2025.11.22",
+        built_at=datetime(2026, 5, 14, 11, 0, tzinfo=timezone.utc),
+        upstream_acquisition_hash=upstream.content_hash(),
+        data_file="data.parquet",
+        data_content_hash="0" * 64,
+        time_basis=TimeBasis(clock_source="derived", description="datakit master timeline"),
+        source_versions=[
+            SourceVersion(
+                tag="mesomap",
+                version="0.3.0",
+                parser_class="datakit.sources.analysis.mesomap.MesoMapSource",
+            ),
+        ],
+        columns=[("mesomap", "L_VISp"), ("wheel", "speed_mm")],
+    )
+    out = tmp_path / "dataset_manifest.json"
+    ds.write(out)
+    loaded = DatasetManifest.read(out)
+    assert loaded == ds
+    assert loaded.upstream_acquisition_hash == upstream.content_hash()
+def test_processing_manifest_roundtrip(tmp_path: Path) -> None:
+    upstream = _sample_acquisition()
+    proc_manifest = ProcessingManifest(
+        tool_name="mock_regional_means",
+        tool_version="0.1.0",
+        tool_invocation="MockRegionalMeans(n_regions=4).run([tiff_path])",
+        built_at=datetime(2026, 5, 14, 12, 0, tzinfo=timezone.utc),
+        upstream_acquisition_hash=upstream.content_hash(),
+        inputs=[
+            InputRef(path="func/meso.ome.tiff", content_hash="a" * 64),
+        ],
+        parameters={"n_regions": 4, "baseline_frames": 10},
+        outputs=[
+            ProducerEntry(
+                device_id="mock_regional_means",
+                device_type="processor",
+                data_type="regional_means",
+                bids_type="func",
+                file_type="csv",
+                output_path="processed/regional_means.csv",
+                time_basis=TimeBasis(clock_source="derived"),
+            ),
+        ],
+    )
+    out = tmp_path / "regional_means.process.json"
+    proc_manifest.write(out)
+    loaded = ProcessingManifest.read(out)
+    assert loaded == proc_manifest
+    assert loaded.schema_version == SCHEMA_VERSION
+    assert loaded.upstream_acquisition_hash == upstream.content_hash()
+    assert len(loaded.outputs) == 1
+    assert loaded.outputs[0].data_type == "regional_means"
+def test_analysis_declaration() -> None:
+    decl = AnalysisDeclaration(
+        name="locomotion_bouts",
+        version="0.1.0",
+        required_signals=[ColumnSpec(source="wheel", signal="speed_mm", unit="mm/s")],
+    )
+    assert decl.required_signals[0].required is True
+def test_paths() -> None:
+    assert bids_session_dir("/x", "001", "1") == Path("/x/data/sub-001/ses-1")
+    assert (
+        bids_stream_filename(
+            subject="001",
+            session="1",
+            task="runA",
+            suffix="meso",
+            extension="ome.tiff",
+            timestamp="251114_103045",
+        )
+        == "251114_103045_sub-001_ses-1_task-runA_meso.ome.tiff"
+    )