labelpull 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ concurrency:
9
+ group: ci-${{ github.workflow }}-${{ github.ref }}
10
+ cancel-in-progress: true
11
+
12
+ jobs:
13
+ test:
14
+ runs-on: ubuntu-latest
15
+ strategy:
16
+ fail-fast: false
17
+ matrix:
18
+ python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+
22
+ - uses: astral-sh/setup-uv@v3
23
+ with:
24
+ enable-cache: true
25
+ cache-dependency-glob: "**/pyproject.toml"
26
+
27
+ - name: Set up Python ${{ matrix.python-version }}
28
+ run: uv python install ${{ matrix.python-version }}
29
+
30
+ - name: Install
31
+ run: |
32
+ uv venv --python ${{ matrix.python-version }} .venv
33
+ uv pip install -e ".[dev]"
34
+
35
+ - name: Lint
36
+ run: |
37
+ source .venv/bin/activate
38
+ ruff check src tests
39
+ ruff format --check src tests
40
+
41
+ - name: Type-check
42
+ run: |
43
+ source .venv/bin/activate
44
+ mypy --strict --no-warn-unused-ignores src/labelpull
45
+
46
+ - name: Test
47
+ run: |
48
+ source .venv/bin/activate
49
+ pytest --cov=labelpull --cov-report=term-missing -q
50
+
51
+ build:
52
+ runs-on: ubuntu-latest
53
+ needs: test
54
+ steps:
55
+ - uses: actions/checkout@v4
56
+ - uses: astral-sh/setup-uv@v3
57
+ - name: Build sdist + wheel
58
+ run: uv build
59
+ - name: Check metadata renders on PyPI
60
+ run: uvx twine check dist/*
61
+ - uses: actions/upload-artifact@v4
62
+ with:
63
+ name: dist
64
+ path: dist/
@@ -0,0 +1,39 @@
1
+ name: Release
2
+
3
+ # Publish to PyPI when a version tag is pushed (e.g. `git tag v0.1.0 && git push --tags`).
4
+ # Uses PyPI trusted publishing (OIDC) — no API token secret required. To enable:
5
+ # 1. Create the project on PyPI and add this repo as a trusted publisher
6
+ # (workflow `release.yml`, environment `pypi`).
7
+ # 2. Create a GitHub environment named `pypi` (optionally with required reviewers).
8
+
9
+ on:
10
+ push:
11
+ tags: ["v*"]
12
+
13
+ jobs:
14
+ build:
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ - uses: astral-sh/setup-uv@v3
19
+ - name: Build sdist + wheel
20
+ run: uv build
21
+ - name: Check metadata renders on PyPI
22
+ run: uvx twine check dist/*
23
+ - uses: actions/upload-artifact@v4
24
+ with:
25
+ name: dist
26
+ path: dist/
27
+
28
+ publish:
29
+ needs: build
30
+ runs-on: ubuntu-latest
31
+ environment: pypi
32
+ permissions:
33
+ id-token: write # OIDC token for PyPI trusted publishing
34
+ steps:
35
+ - uses: actions/download-artifact@v4
36
+ with:
37
+ name: dist
38
+ path: dist/
39
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,10 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .venv/
4
+ .pytest_cache/
5
+ .ruff_cache/
6
+ .mypy_cache/
7
+ dist/
8
+ *.egg-info/
9
+ .coverage
10
+ .claude-flow/
@@ -0,0 +1,69 @@
1
+ Metadata-Version: 2.4
2
+ Name: labelpull
3
+ Version: 0.1.0
4
+ Summary: Pull the latest Labelbox annotations into a tidy, ontology-agnostic table.
5
+ Author-email: Wietze Suijker <wietze.suijker@gmail.com>
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.10
8
+ Requires-Dist: typer>=0.9
9
+ Provides-Extra: dev
10
+ Requires-Dist: mypy>=1.8; extra == 'dev'
11
+ Requires-Dist: pytest-cov>=4.1; extra == 'dev'
12
+ Requires-Dist: pytest>=7.4; extra == 'dev'
13
+ Requires-Dist: ruff>=0.4; extra == 'dev'
14
+ Provides-Extra: live
15
+ Requires-Dist: labelbox>=7.0; extra == 'live'
16
+ Description-Content-Type: text/markdown
17
+
18
+ # labelpull
19
+
20
+ Pull the latest Labelbox annotations into a tidy, ontology-agnostic table.
21
+
22
+ The Labelbox SDK already exports a project's labels and streams them. What it
23
+ doesn't give you is a *tabular* view of that deeply nested JSON, the correctness
24
+ logic to pick the right label when a row was reviewed, or a workflow status that
25
+ is always populated. `labelpull` is exactly that thin layer on top of the SDK.
26
+
27
+ ## Install
28
+
29
+ ```bash
30
+ pip install labelpull # offline parsing + CLI
31
+ pip install 'labelpull[live]' # + the Labelbox SDK for live pulls
32
+ ```
33
+
34
+ ## CLI
35
+
36
+ ```bash
37
+ export LABELBOX_API_KEY=...
38
+ labelpull pull <PROJECT_ID> -o labels.csv # generic long CSV (any ontology)
39
+ labelpull pull <PROJECT_ID> --status Done # only verified rows
40
+ labelpull pull <PROJECT_ID> --since 2026-06-01 # only rows whose newest label is on/after this date
41
+ labelpull pull <PROJECT_ID> --from-export export.ndjson # offline, no API key
42
+ labelpull pull <PROJECT_ID> --schema species -o taxa.csv # speciesfirst Taxon/Organs wide CSV
43
+ ```
44
+
45
+ `--schema generic` (default) writes one row per feature — every classification
46
+ and object, any ontology:
47
+
48
+ ```
49
+ global_key,data_row_id,feature_kind,feature_name,value,workflow_status,labeled_by,created_at,parent_feature_id
50
+ ```
51
+
52
+ ## Library
53
+
54
+ ```python
55
+ import labelpull
56
+
57
+ rows = list(labelpull.export("proj_id", status="Done")) # or read_export_file("export.ndjson")
58
+ features = [f for r in rows for f in labelpull.flatten(r, "proj_id")]
59
+ labelpull.write_csv("labels.csv", labelpull.GenericAdapter(), features)
60
+ print(labelpull.summarize(rows, features))
61
+ ```
62
+
63
+ `flatten()` handles radio / checklist / text classifications and bbox / polygon /
64
+ line / point / mask objects (with nested classifications linked to their parent),
65
+ and always selects the most recently created label so a QC-reviewed row reports
66
+ the reviewer's answer, not the annotator's.
67
+
68
+ Write your own `Adapter` to collapse features into a project-specific wide table;
69
+ `SpeciesAdapter` is the reference implementation.
@@ -0,0 +1,52 @@
1
+ # labelpull
2
+
3
+ Pull the latest Labelbox annotations into a tidy, ontology-agnostic table.
4
+
5
+ The Labelbox SDK already exports a project's labels and streams them. What it
6
+ doesn't give you is a *tabular* view of that deeply nested JSON, the correctness
7
+ logic to pick the right label when a row was reviewed, or a workflow status that
8
+ is always populated. `labelpull` is exactly that thin layer on top of the SDK.
9
+
10
+ ## Install
11
+
12
+ ```bash
13
+ pip install labelpull # offline parsing + CLI
14
+ pip install 'labelpull[live]' # + the Labelbox SDK for live pulls
15
+ ```
16
+
17
+ ## CLI
18
+
19
+ ```bash
20
+ export LABELBOX_API_KEY=...
21
+ labelpull pull <PROJECT_ID> -o labels.csv # generic long CSV (any ontology)
22
+ labelpull pull <PROJECT_ID> --status Done # only verified rows
23
+ labelpull pull <PROJECT_ID> --since 2026-06-01 # only rows whose newest label is on/after this date
24
+ labelpull pull <PROJECT_ID> --from-export export.ndjson # offline, no API key
25
+ labelpull pull <PROJECT_ID> --schema species -o taxa.csv # speciesfirst Taxon/Organs wide CSV
26
+ ```
27
+
28
+ `--schema generic` (default) writes one row per feature — every classification
29
+ and object, any ontology:
30
+
31
+ ```
32
+ global_key,data_row_id,feature_kind,feature_name,value,workflow_status,labeled_by,created_at,parent_feature_id
33
+ ```
34
+
35
+ ## Library
36
+
37
+ ```python
38
+ import labelpull
39
+
40
+ rows = list(labelpull.export("proj_id", status="Done")) # or read_export_file("export.ndjson")
41
+ features = [f for r in rows for f in labelpull.flatten(r, "proj_id")]
42
+ labelpull.write_csv("labels.csv", labelpull.GenericAdapter(), features)
43
+ print(labelpull.summarize(rows, features))
44
+ ```
45
+
46
+ `flatten()` handles radio / checklist / text classifications and bbox / polygon /
47
+ line / point / mask objects (with nested classifications linked to their parent),
48
+ and always selects the most recently created label so a QC-reviewed row reports
49
+ the reviewer's answer, not the annotator's.
50
+
51
+ Write your own `Adapter` to collapse features into a project-specific wide table;
52
+ `SpeciesAdapter` is the reference implementation.
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "labelpull"
7
+ version = "0.1.0"
8
+ description = "Pull the latest Labelbox annotations into a tidy, ontology-agnostic table."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ authors = [{ name = "Wietze Suijker", email = "wietze.suijker@gmail.com" }]
13
+ dependencies = ["typer>=0.9"]
14
+
15
+ [project.optional-dependencies]
16
+ live = ["labelbox>=7.0"]
17
+ dev = ["pytest>=7.4", "pytest-cov>=4.1", "ruff>=0.4", "mypy>=1.8"]
18
+
19
+ [project.scripts]
20
+ labelpull = "labelpull.cli:app"
21
+
22
+ [tool.hatch.build.targets.wheel]
23
+ packages = ["src/labelpull"]
24
+
25
+ [tool.ruff]
26
+ line-length = 100
27
+ target-version = "py310"
28
+
29
+ [tool.ruff.lint]
30
+ select = ["E", "F", "I", "B", "UP", "SIM", "PL"]
31
+
32
+ [tool.ruff.lint.per-file-ignores]
33
+ # Typer expresses CLI args as call-defaults (B008) and wide signatures (PLR0913).
34
+ "src/labelpull/cli.py" = ["B008", "PLR0913"]
35
+ # Tests assert against literal expected values (PLR2004), import lazily (PLC0415), and may contain long literal lines (E501).
36
+ "tests/*" = ["PLR2004", "PLC0415", "E501"]
37
+
38
+ [tool.pytest.ini_options]
39
+ testpaths = ["tests"]
40
+
41
+ [[tool.mypy.overrides]]
42
+ module = "labelbox"
43
+ ignore_missing_imports = true
@@ -0,0 +1,44 @@
1
+ """labelpull: pull the latest Labelbox annotations into a tidy table.
2
+
3
+ The Labelbox SDK exports a project's labels as nested, ontology-shaped JSON.
4
+ labelpull is the thin layer the SDK lacks: a generic flattener
5
+ (:func:`~labelpull.core.flatten`) plus the correctness logic (latest-label
6
+ selection, status normalization) and a one-command CLI on top.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from labelpull.adapters import (
12
+ ADAPTERS,
13
+ Adapter,
14
+ GenericAdapter,
15
+ SpeciesAdapter,
16
+ write_csv,
17
+ )
18
+ from labelpull.core import (
19
+ WORKFLOW_STATUSES,
20
+ FeatureRow,
21
+ Summary,
22
+ export,
23
+ flatten,
24
+ read_export_file,
25
+ summarize,
26
+ )
27
+
28
+ __version__ = "0.1.0"
29
+
30
+ __all__ = [
31
+ "ADAPTERS",
32
+ "WORKFLOW_STATUSES",
33
+ "Adapter",
34
+ "FeatureRow",
35
+ "GenericAdapter",
36
+ "SpeciesAdapter",
37
+ "Summary",
38
+ "__version__",
39
+ "export",
40
+ "flatten",
41
+ "read_export_file",
42
+ "summarize",
43
+ "write_csv",
44
+ ]
@@ -0,0 +1,110 @@
1
+ """Adapters: collapse ontology-agnostic :class:`FeatureRow` rows into a shape.
2
+
3
+ The generic path writes ``FeatureRow`` rows straight to a long-format CSV that
4
+ any project can read. An adapter narrows that to a project-specific wide record.
5
+ :class:`SpeciesAdapter` is the reference implementation, reproducing
6
+ speciesfirst's ``global_key,taxon,organs,labeled_by,workflow_status`` pull CSV
7
+ from the generic rows, so the engine has exactly one parser.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import csv
13
+ from collections import OrderedDict
14
+ from collections.abc import Iterable, Sequence
15
+ from pathlib import Path
16
+ from typing import Protocol, runtime_checkable
17
+
18
+ from labelpull.core import FeatureRow
19
+
20
+
21
+ @runtime_checkable
22
+ class Adapter(Protocol):
23
+ """Map flattened features to named columns plus the rows to write."""
24
+
25
+ columns: Sequence[str]
26
+
27
+ def rows(self, features: Iterable[FeatureRow]) -> Iterable[Sequence[str]]: ...
28
+
29
+
30
+ class GenericAdapter:
31
+ """One CSV row per feature: the ontology-agnostic long format."""
32
+
33
+ columns: Sequence[str] = (
34
+ "global_key",
35
+ "data_row_id",
36
+ "feature_kind",
37
+ "feature_name",
38
+ "value",
39
+ "workflow_status",
40
+ "labeled_by",
41
+ "created_at",
42
+ "parent_feature_id",
43
+ )
44
+
45
+ def rows(self, features: Iterable[FeatureRow]) -> Iterable[Sequence[str]]:
46
+ for f in features:
47
+ yield (
48
+ f.global_key,
49
+ f.data_row_id,
50
+ f.feature_kind,
51
+ f.feature_name,
52
+ f.value,
53
+ f.workflow_status or "",
54
+ f.labeled_by or "",
55
+ f.created_at or "",
56
+ f.parent_feature_id,
57
+ )
58
+
59
+
60
+ class SpeciesAdapter:
61
+ """One row per ``global_key``: reproduces speciesfirst's pull CSV.
62
+
63
+ ``taxon`` is the ``Taxon`` single-select radio; ``organs`` is the ``Organs``
64
+ checklist (``;``-joined). A reached-and-labelled row with neither still
65
+ appears (seeded by the ``label`` sentinel), matching speciesfirst's "reached
66
+ but unlabelled yields ``taxon=''``" behaviour. Insertion order follows the
67
+ export stream.
68
+ """
69
+
70
+ columns: Sequence[str] = ("global_key", "taxon", "organs", "labeled_by", "workflow_status")
71
+ taxon_feature = "Taxon"
72
+ organs_feature = "Organs"
73
+
74
+ def rows(self, features: Iterable[FeatureRow]) -> Iterable[Sequence[str]]:
75
+ by_key: OrderedDict[str, dict[str, str]] = OrderedDict()
76
+ for f in features:
77
+ rec = by_key.setdefault(
78
+ f.global_key,
79
+ {"taxon": "", "organs": "", "labeled_by": "", "workflow_status": ""},
80
+ )
81
+ if f.labeled_by:
82
+ rec["labeled_by"] = f.labeled_by
83
+ if f.workflow_status:
84
+ rec["workflow_status"] = f.workflow_status
85
+ if f.feature_kind == "radio" and f.feature_name == self.taxon_feature and f.value:
86
+ rec["taxon"] = f.value
87
+ elif f.feature_kind == "checklist" and f.feature_name == self.organs_feature:
88
+ rec["organs"] = f.value
89
+ for global_key, rec in by_key.items():
90
+ yield (
91
+ global_key,
92
+ rec["taxon"],
93
+ rec["organs"],
94
+ rec["labeled_by"],
95
+ rec["workflow_status"],
96
+ )
97
+
98
+
99
+ ADAPTERS: dict[str, type] = {"generic": GenericAdapter, "species": SpeciesAdapter}
100
+
101
+
102
+ def write_csv(path: str | Path, adapter: Adapter, features: Iterable[FeatureRow]) -> Path:
103
+ """Write ``features`` through ``adapter`` to ``path`` (parents created)."""
104
+ path = Path(path)
105
+ path.parent.mkdir(parents=True, exist_ok=True)
106
+ with path.open("w", newline="") as f:
107
+ writer = csv.writer(f)
108
+ writer.writerow(adapter.columns)
109
+ writer.writerows(adapter.rows(features))
110
+ return path
@@ -0,0 +1,101 @@
1
+ """``labelpull`` CLI: pull the latest Labelbox annotations to a tidy CSV.
2
+
3
+ labelpull pull PROJECT_ID -o labels.csv
4
+ labelpull pull PROJECT_ID --status Done --since 2026-06-01
5
+ labelpull pull PROJECT_ID --schema species -o taxa.csv
6
+ labelpull pull PROJECT_ID --from-export export.ndjson # offline, no API key
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from pathlib import Path
12
+
13
+ import typer
14
+
15
+ from labelpull import __version__
16
+ from labelpull.adapters import ADAPTERS, write_csv
17
+ from labelpull.core import (
18
+ FeatureRow,
19
+ JsonDict,
20
+ _created_at,
21
+ _select_project,
22
+ flatten,
23
+ read_export_file,
24
+ summarize,
25
+ )
26
+ from labelpull.core import export as live_export
27
+
28
+ app = typer.Typer(add_completion=False, help="Pull the latest Labelbox annotations to CSV.")
29
+
30
+
31
# Root callback with no options and no body. Its docstring is a runtime string,
# so it is left untouched.
# NOTE(review): presumably registered so Typer keeps `pull` as an explicit
# subcommand even though it is the only command — confirm against Typer docs.
@app.callback()
def _main() -> None:
    """labelpull: pull the latest Labelbox annotations into a tidy table."""
34
+
35
+
36
@app.command()
def pull(
    project_id: str = typer.Argument(..., help="Labelbox project id to export from."),
    out: Path = typer.Option(
        Path("pulled_labels.csv"), "--out", "-o", help="Where to write the CSV."
    ),
    schema: str = typer.Option(
        "generic",
        help="generic = one row per feature (any ontology); "
        "species = speciesfirst Taxon/Organs wide CSV.",
    ),
    status: str | None = typer.Option(
        None, help="Filter by task-queue stage: ToLabel | InReview | InRework | Done."
    ),
    since: str | None = typer.Option(
        None, help="Keep only rows whose newest label was created on/after this ISO date/time."
    ),
    from_export: Path | None = typer.Option(
        None,
        exists=True,
        dir_okay=False,
        help="Flatten a saved export (JSON/NDJSON) offline instead of the live API.",
    ),
    api_key: str | None = typer.Option(None, help="Labelbox API key (else LABELBOX_API_KEY)."),
) -> None:
    """Export the latest annotations and flatten them to CSV, with a summary."""
    # Reject an unknown schema before doing any I/O.
    if schema not in ADAPTERS:
        raise typer.BadParameter(f"unknown schema {schema!r}; choose from {sorted(ADAPTERS)}")
    adapter = ADAPTERS[schema]()

    typer.echo(f"labelpull v{__version__}")
    if from_export is not None:
        # Function-scope import keeps this fix self-contained; the module
        # already imports other private helpers from labelpull.core.
        from labelpull.core import _workflow_status  # noqa: PLC0415

        rows = read_export_file(from_export)
        # Offline there is no server-side filtering, so apply --status here
        # as well as --since; previously --status was silently ignored.
        if status is not None:
            rows = [
                r for r in rows if _workflow_status(_select_project(r, project_id)) == status
            ]
        if since is not None:
            rows = [r for r in rows if _row_since(r, project_id, since)]
        typer.echo(f" read {len(rows)} rows from {from_export}")
    else:
        # Live pull: the export call handles both filters.
        rows = list(live_export(project_id, status=status, since=since, api_key=api_key))
        typer.echo(f" exported {len(rows)} rows from project {project_id}")

    features = [f for r in rows for f in flatten(r, project_id)]
    _print_summary(rows, features)
    write_csv(out, adapter, features)
    typer.echo(f"wrote {schema} CSV: {out}")
80
+
81
+
82
def _row_since(dr: JsonDict, project_id: str, since: str) -> bool:
    """Tell whether the row's newest label in ``project_id`` is on/after ``since``.

    The comparison is a plain string comparison of the timestamps; a row with
    no label compares as the empty string.
    """
    newest = _created_at(_select_project(dr, project_id))
    return newest >= since
84
+
85
+
86
def _print_summary(rows: list[JsonDict], features: list[FeatureRow]) -> None:
    """Echo the pull summary: counts, then status/kind breakdowns, then freshness.

    Breakdown lines and the latest-label line are printed only when there is
    something to show; breakdown entries are sorted by key.
    """
    summary = summarize(rows, features)
    headline = (
        f" {summary.n_labelled} labelled / {summary.n_data_rows} rows "
        f"({summary.n_reached_unlabelled} reached unlabelled)"
    )
    typer.echo(headline)

    breakdowns = ((" status: ", summary.statuses), (" kinds: ", summary.feature_kinds))
    for title, counts in breakdowns:
        if counts:
            typer.echo(title + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())))

    if summary.latest_created_at:
        typer.echo(f" latest label: {summary.latest_created_at}")
98
+
99
+
100
+ if __name__ == "__main__": # pragma: no cover
101
+ app()
@@ -0,0 +1,285 @@
1
+ """Ontology-agnostic Labelbox export + flatten.
2
+
3
+ The Labelbox SDK already exports a project's labels and streams them as deeply
4
+ nested, ontology-shaped JSON. What it does *not* give you is a tabular view, the
5
+ correctness logic to pick the right label when a row was reviewed, or a workflow
6
+ status that is always populated. This module is exactly that thin layer:
7
+
8
+ * :func:`export` wraps ``project.export(...)`` + ``wait_till_done()`` +
9
+ ``get_buffered_stream()`` (SDK lazy-imported, so it is optional) and adds a
10
+ ``since`` filter for "only the latest annotations".
11
+ * :func:`flatten` turns one export row into :class:`FeatureRow` long-format rows,
12
+ covering *every* feature kind (classifications AND objects) without assuming a
13
+ particular ontology. It encodes the two traps a hand-written parser gets wrong:
14
+ selecting the most-recently-created label (a QC-reviewed row carries both the
15
+ annotator's and the reviewer's label) and normalizing the workflow status.
16
+ * :func:`read_export_file` parses a saved export (UI download or a prior pull) so
17
+ the same flattener runs offline, no API key required.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ import os
24
+ from collections.abc import Iterable, Iterator
25
+ from dataclasses import dataclass
26
+ from pathlib import Path
27
+ from typing import Any, cast
28
+
29
+ # One export row (and its nested blocks) is arbitrary JSON; alias it for brevity.
30
+ JsonDict = dict[str, Any]
31
+
32
+ # The task-queue stages ``project.export(filters={"workflow_status": ...})`` accepts.
33
+ WORKFLOW_STATUSES = ("ToLabel", "InReview", "InRework", "Done")
34
+
35
+ # Geometry keys a localized object may carry in the v7 export, in probe order.
36
+ _GEOMETRY_KINDS = ("bounding_box", "polygon", "line", "point", "mask")
37
+
38
+
39
@dataclass(frozen=True)
class FeatureRow:
    """A single flattened feature of one labelled data row, for any ontology.

    A row is either a classification answer or a localized object. Nested
    classifications of an object are separate rows that point back to the
    object through :attr:`parent_feature_id`. Every labelled data row also
    produces one sentinel row with ``feature_kind="label"`` and ``value=""``,
    so a labelled-but-empty data row is still represented.
    """

    global_key: str
    data_row_id: str
    # Kind of feature: label, radio, checklist, text, bounding_box, polygon,
    # line, point, mask, relationship or unknown.
    feature_kind: str
    feature_name: str
    # Answer value(s) or compact geometry; "" when there is none.
    value: str
    workflow_status: str | None
    labeled_by: str | None
    created_at: str | None
    # "" for top-level features.
    parent_feature_id: str
61
+
62
+
63
def export(
    project_id: str,
    *,
    status: str | None = None,
    since: str | None = None,
    api_key: str | None = None,
    client: Any | None = None,
) -> Iterator[JsonDict]:
    """Yield the export rows of ``project_id``, one dict per data row.

    ``status`` is passed to the export as a ``workflow_status`` filter (see
    :data:`WORKFLOW_STATUSES`). ``since`` drops rows whose newest label is
    older than the given ISO date/datetime string; timestamps are compared as
    strings. A ``client`` may be injected (e.g. a stub); without one, a client
    is built from ``api_key`` or ``LABELBOX_API_KEY``.
    """
    if client is None:
        client = _make_client(api_key)
    project = client.get_project(project_id)
    task = project.export(
        params={"data_row_details": True, "label_details": True, "project_details": True},
        filters={"workflow_status": status} if status else None,
    )
    task.wait_till_done()
    for streamed in task.get_buffered_stream():
        payload = streamed.json
        # Skip rows whose newest label predates the cut-off.
        if since is not None and _created_at(_select_project(payload, project_id)) < since:
            continue
        yield payload
91
+
92
+
93
def read_export_file(path: str | Path) -> list[JsonDict]:
    """Load a saved export (JSON array, single JSON object, or NDJSON).

    The file is decoded as UTF-8 whatever the platform locale, and a leading
    byte-order mark is tolerated. An empty or whitespace-only file gives ``[]``.
    A single JSON document is returned as-is when it is a list and wrapped in
    a one-element list otherwise. Anything that does not parse as one document
    is read as NDJSON, one JSON value per non-blank line.

    Raises ``json.JSONDecodeError`` if a line of the NDJSON fallback is not
    valid JSON.
    """
    # "utf-8-sig" reads plain UTF-8 too; it only strips a BOM when present,
    # which json.loads would otherwise reject.
    text = Path(path).read_text(encoding="utf-8-sig").strip()
    if not text:
        return []
    try:
        loaded = json.loads(text)
    except json.JSONDecodeError:
        # Several documents in one file ("Extra data"): treat as NDJSON.
        return [json.loads(line) for line in text.splitlines() if line.strip()]
    return loaded if isinstance(loaded, list) else [loaded]
103
+
104
+
105
def flatten(dr: JsonDict, project_id: str | None = None) -> list[FeatureRow]:
    """Turn one export row into its :class:`FeatureRow` rows.

    ``project_id`` names the project whose labels are read. With ``None`` the
    row's only project is used, and a row with several projects yields nothing
    rather than mixing them. Rows without a ``global_key`` or without any
    label yield ``[]``.

    Output order: the ``label`` sentinel, top-level classifications, each
    object followed by its nested classifications, then relationships.
    """
    data_row = dr.get("data_row") or {}
    global_key = data_row.get("global_key") or ""
    data_row_id = data_row.get("id") or global_key
    proj = _select_project(dr, project_id)
    label = _latest_label(proj)
    if label is None or not global_key:
        return []

    status = _workflow_status(proj)
    details = label.get("label_details") or {}
    labeled_by = details.get("created_by")
    created_at = details.get("created_at")
    annotations = label.get("annotations") or {}

    def make(kind: str, name: str | None, value: str, parent: str = "") -> FeatureRow:
        # Every row shares the data-row identity and the chosen label's metadata.
        return FeatureRow(
            global_key=global_key,
            data_row_id=data_row_id,
            feature_kind=kind,
            feature_name=name or "",
            value=value,
            workflow_status=status,
            labeled_by=labeled_by,
            created_at=created_at,
            parent_feature_id=parent,
        )

    # Sentinel first: records who labelled the row and when, even if empty.
    flattened: list[FeatureRow] = [make("label", "", "")]

    for cls in annotations.get("classifications") or []:
        kind, value = _classification_value(cls)
        flattened.append(make(kind, cls.get("name"), value))

    for obj in annotations.get("objects") or []:
        kind, value = _object_geometry(obj)
        # Nested answers link to the object via this id; the object's own row
        # does not store it.
        owner_id = obj.get("feature_id") or obj.get("feature_schema_id") or ""
        flattened.append(make(kind, obj.get("name"), value))
        for nested in obj.get("classifications") or []:
            nested_kind, nested_value = _classification_value(nested)
            flattened.append(make(nested_kind, nested.get("name"), nested_value, owner_id))

    for rel in annotations.get("relationships") or []:
        encoded = json.dumps(rel.get("relationship") or {}, sort_keys=True)
        flattened.append(make("relationship", rel.get("name"), encoded))

    return flattened
163
+
164
+
165
@dataclass(frozen=True)
class Summary:
    """Aggregate counts for one pull: volume, composition and freshness."""

    # Number of export rows that were summarized.
    n_data_rows: int
    # Number of distinct global keys that produced at least one feature row.
    n_labelled: int
    # Export rows that produced no feature rows (never negative).
    n_reached_unlabelled: int
    # Per-kind and per-name counts of non-sentinel feature rows.
    feature_kinds: dict[str, int]
    feature_names: dict[str, int]
    # Workflow-status counts, taken from the ``label`` sentinel rows.
    statuses: dict[str, int]
    # Greatest ``created_at`` among sentinel rows, or ``None`` if there is none.
    latest_created_at: str | None
176
+
177
+
178
def summarize(rows: Iterable[JsonDict], features: Iterable[FeatureRow]) -> Summary:
    """Build the :class:`Summary` for a pull.

    ``label`` sentinel rows feed the status counts and the latest timestamp;
    all other rows feed the per-kind and per-name counts. Both inputs are
    consumed once.
    """
    data_rows = list(rows)
    feature_rows = list(features)

    labelled: set[str] = set()
    kind_counts: dict[str, int] = {}
    name_counts: dict[str, int] = {}
    status_counts: dict[str, int] = {}
    newest: str | None = None

    for feat in feature_rows:
        labelled.add(feat.global_key)
        if feat.feature_kind != "label":
            kind_counts[feat.feature_kind] = kind_counts.get(feat.feature_kind, 0) + 1
            if feat.feature_name:
                name_counts[feat.feature_name] = name_counts.get(feat.feature_name, 0) + 1
            continue
        # Sentinel row: one per labelled data row.
        if feat.workflow_status:
            status_counts[feat.workflow_status] = status_counts.get(feat.workflow_status, 0) + 1
        if feat.created_at and (newest is None or feat.created_at > newest):
            newest = feat.created_at

    total = len(data_rows)
    with_labels = len(labelled)
    return Summary(
        n_data_rows=total,
        n_labelled=with_labels,
        # Clamp at zero in case features were produced from rows not in ``rows``.
        n_reached_unlabelled=max(total - with_labels, 0),
        feature_kinds=kind_counts,
        feature_names=name_counts,
        statuses=status_counts,
        latest_created_at=newest,
    )
208
+
209
+
210
+ # --- internals -------------------------------------------------------------
211
+
212
+
213
+ def _make_client(api_key: str | None) -> Any:
214
+ try:
215
+ import labelbox as lb # noqa: PLC0415 (optional dep, imported only for live pulls)
216
+ except ImportError as exc: # pragma: no cover - exercised only without the SDK
217
+ raise RuntimeError(
218
+ "a live pull needs the Labelbox SDK: pip install 'labelpull[live]'"
219
+ ) from exc
220
+ key = api_key or os.environ.get("LABELBOX_API_KEY")
221
+ if not key:
222
+ raise RuntimeError(
223
+ "no Labelbox API key: pass api_key=... or set LABELBOX_API_KEY "
224
+ "(or use a saved export with read_export_file)"
225
+ )
226
+ return lb.Client(api_key=key)
227
+
228
+
229
+ def _select_project(dr: JsonDict, project_id: str | None) -> JsonDict:
230
+ projects = dr.get("projects") or {}
231
+ if project_id is not None:
232
+ return projects.get(project_id) or {}
233
+ if len(projects) == 1:
234
+ return next(iter(projects.values()))
235
+ return {} # ambiguous: caller must name the project
236
+
237
+
238
def _latest_label(proj: JsonDict) -> JsonDict | None:
    """Return the most recently created label of ``proj``, or ``None`` if none.

    A QC-reviewed row holds both the annotator's label and the reviewer's, so
    the verified answer is the newest label rather than ``labels[0]``.
    """
    candidates = proj.get("labels") or []
    if candidates:
        return cast("JsonDict", max(candidates, key=_created_at_of_label))
    return None
245
+
246
+
247
+ def _created_at_of_label(label: JsonDict) -> str:
248
+ return (label.get("label_details") or {}).get("created_at") or ""
249
+
250
+
251
def _created_at(proj: JsonDict) -> str:
    """Return the creation timestamp of the project's newest label, else ``""``."""
    newest = _latest_label(proj)
    if not newest:
        return ""
    return _created_at_of_label(newest)
254
+
255
+
256
+ def _workflow_status(proj: JsonDict) -> str | None:
257
+ details = proj.get("project_details") or {}
258
+ status = details.get("workflow_status")
259
+ if status is None:
260
+ queue = details.get("task_queue_name") or details.get("task_queue_status")
261
+ status = "Done" if queue == "Done" else queue
262
+ return status
263
+
264
+
265
+ def _classification_value(cls: JsonDict) -> tuple[str, str]:
266
+ if cls.get("radio_answer"):
267
+ answer = cls["radio_answer"]
268
+ return "radio", answer.get("value") or answer.get("name") or ""
269
+ if cls.get("checklist_answers") is not None:
270
+ values = [a.get("value") or a.get("name") or "" for a in cls["checklist_answers"]]
271
+ return "checklist", ";".join(v for v in values if v)
272
+ if cls.get("text_answer") is not None:
273
+ return "text", (cls["text_answer"] or {}).get("content") or ""
274
+ return "unknown", ""
275
+
276
+
277
def _object_geometry(obj: JsonDict) -> tuple[str, str]:
    """Return ``(kind, value)`` for a localized object.

    The first geometry key of ``_GEOMETRY_KINDS`` present on ``obj`` decides
    the kind. A mask is reported by its ``url``; every other geometry is
    serialized as JSON with sorted keys. No geometry gives ``("unknown", "")``.
    """
    kind = next((k for k in _GEOMETRY_KINDS if obj.get(k) is not None), None)
    if kind is None:
        return "unknown", ""
    geom = obj.get(kind)
    if kind == "mask":
        return "mask", (geom or {}).get("url") or ""
    return kind, json.dumps(geom, sort_keys=True)
File without changes
@@ -0,0 +1,2 @@
1
+ {"data_row": {"id": "dr_10", "global_key": "scene_a.png"}, "projects": {"proj_y": {"labels": [{"label_details": {"created_at": "2026-06-05T08:00:00Z", "created_by": "labeler@x.org"}, "annotations": {"classifications": [{"name": "Caption", "text_answer": {"content": "two crowns"}}], "objects": [{"feature_id": "f_box1", "name": "Plant", "bounding_box": {"top": 10, "left": 20, "height": 30, "width": 40}, "classifications": [{"name": "Species", "radio_answer": {"value": "Cecropia"}}]}, {"feature_id": "f_poly1", "name": "Canopy", "polygon": [{"x": 1, "y": 2}, {"x": 3, "y": 4}, {"x": 5, "y": 6}], "classifications": []}]}}], "project_details": {"workflow_status": "Done"}}}}
2
+ {"data_row": {"id": "dr_11", "global_key": "scene_b.png"}, "projects": {"proj_y": {"labels": [{"label_details": {"created_at": "2026-06-06T08:00:00Z", "created_by": "labeler@x.org"}, "annotations": {"classifications": [], "objects": [{"feature_id": "f_mask1", "name": "Leaf", "mask": {"url": "https://api.labelbox.com/masks/abc.png"}, "classifications": []}, {"feature_id": "f_pt1", "name": "Tip", "point": {"x": 7, "y": 8}, "classifications": []}]}}], "project_details": {"workflow_status": "InReview"}}}}
@@ -0,0 +1,4 @@
1
+ {"data_row": {"id": "dr_1", "global_key": "photo_a.JPG"}, "projects": {"proj_x": {"labels": [{"label_details": {"created_at": "2026-06-01T10:00:00Z", "created_by": "ann@bci.org"}, "annotations": {"classifications": [{"name": "Taxon", "radio_answer": {"value": "Ficus insipida", "name": "Ficus insipida"}}, {"name": "Organs", "checklist_answers": [{"value": "leaf"}, {"value": "flower"}]}], "objects": []}}], "project_details": {"workflow_status": "InReview"}}}}
2
+ {"data_row": {"id": "dr_2", "global_key": "photo_b.JPG"}, "projects": {"proj_x": {"labels": [{"label_details": {"created_at": "2026-06-02T09:00:00Z", "created_by": "ann@bci.org"}, "annotations": {"classifications": [{"name": "Taxon", "radio_answer": {"value": "Apeiba membranacea"}}], "objects": []}}, {"label_details": {"created_at": "2026-06-03T14:00:00Z", "created_by": "reviewer@bci.org"}, "annotations": {"classifications": [{"name": "Taxon", "radio_answer": {"value": "Apeiba tibourbou"}}, {"name": "Organs", "checklist_answers": [{"value": "fruit"}]}], "objects": []}}], "project_details": {"workflow_status": "Done"}}}}
3
+ {"data_row": {"id": "dr_3", "global_key": "photo_c.JPG"}, "projects": {"proj_x": {"labels": [{"label_details": {"created_at": "2026-06-01T12:00:00Z", "created_by": "ann@bci.org"}, "annotations": {"classifications": [], "objects": []}}], "project_details": {"task_queue_name": "Done"}}}}
4
+ {"data_row": {"id": "dr_4", "global_key": "photo_d.JPG"}, "projects": {"proj_x": {"labels": [], "project_details": {"workflow_status": "ToLabel"}}}}
@@ -0,0 +1,66 @@
1
+ """Adapter tests: generic long CSV + the species reference adapter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ from pathlib import Path
7
+
8
+ from labelpull.adapters import ADAPTERS, GenericAdapter, SpeciesAdapter, write_csv
9
+ from labelpull.core import flatten, read_export_file
10
+
11
+ FIXTURES = Path(__file__).parent / "fixtures"
12
+
13
+
14
def _features(name: str, project_id: str) -> list:
    """Load fixture file *name* and flatten each export row for *project_id*."""
    flattened = []
    for row in read_export_file(FIXTURES / name):
        flattened.extend(flatten(row, project_id))
    return flattened
17
+
18
+
19
def _read(path: Path) -> list[list[str]]:
    """Return every row of the CSV file at *path*, header row included."""
    # newline="" lets the csv module handle line endings itself.
    with path.open(newline="") as handle:
        parsed = csv.reader(handle)
        table = list(parsed)
    return table
22
+
23
+
24
def test_registry_keys() -> None:
    """The adapter registry is keyed by exactly ``generic`` and ``species``."""
    registered = set(ADAPTERS)
    assert registered == {"generic", "species"}
26
+
27
+
28
def test_generic_long_csv_roundtrips_every_feature(tmp_path: Path) -> None:
    """Write the long-format CSV and check it mirrors the flattened features.

    The header must equal ``GenericAdapter.columns`` and the body must hold
    exactly one CSV row per feature row.
    """
    feats = _features("boxes_masks_export.ndjson", "proj_y")
    out = write_csv(tmp_path / "g.csv", GenericAdapter(), feats)
    header, *body = _read(out)
    assert header == list(GenericAdapter.columns)
    assert len(body) == len(feats)  # one CSV row per feature row
    # The nested species radio carries its parent box id in the long format.
    # Only body rows are searched so the header can never match, and a missing
    # row fails with a readable message instead of an IndexError.
    species = next((r for r in body if r[3] == "Species"), None)
    assert species is not None, "no CSV row for the nested 'Species' classification"
    # Positions index into GenericAdapter.columns; presumably kind (2),
    # value (4) and parent feature id (8) -- confirm against the adapter.
    assert species[2] == "radio"
    assert species[4] == "Cecropia"
    assert species[8] == "f_box1"
37
+
38
+
39
def test_species_adapter_wide_csv(tmp_path: Path) -> None:
    """Species adapter writes one wide row per labelled data row, in stream order."""
    features = _features("species_export.ndjson", "proj_x")
    csv_path = write_csv(tmp_path / "s.csv", SpeciesAdapter(), features)
    table = _read(csv_path)
    header, records = table[0], table[1:]
    assert header == ["global_key", "taxon", "organs", "labeled_by", "workflow_status"]
    by_key = {record[0]: record for record in records}
    expected = {
        # photo_a: InReview, leaf;flower
        "photo_a.JPG": ["photo_a.JPG", "Ficus insipida", "leaf;flower", "ann@bci.org", "InReview"],
        # photo_b: reviewer's corrected taxon + fruit, Done
        "photo_b.JPG": ["photo_b.JPG", "Apeiba tibourbou", "fruit", "reviewer@bci.org", "Done"],
        # photo_c: reached + labelled but empty -> taxon "", still present
        "photo_c.JPG": ["photo_c.JPG", "", "", "ann@bci.org", "Done"],
    }
    for global_key, wanted in expected.items():
        assert by_key[global_key] == wanted
    # photo_d: unlabelled -> absent
    assert "photo_d.JPG" not in by_key
    # insertion order follows the export stream
    emitted_order = [record[0] for record in records]
    assert emitted_order == ["photo_a.JPG", "photo_b.JPG", "photo_c.JPG"]
@@ -0,0 +1,118 @@
1
+ """CLI + live-export-with-stub tests (no network, no API key)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ from pathlib import Path
7
+
8
+ from typer.testing import CliRunner
9
+
10
+ from labelpull.cli import app
11
+ from labelpull.core import export, read_export_file
12
+
13
+ FIXTURES = Path(__file__).parent / "fixtures"
14
+ runner = CliRunner()
15
+
16
+
17
class _StubStream:
    """Iterable of row objects that each expose their payload as ``.json``."""

    def __init__(self, rows: list[dict]) -> None:
        wrapped = []
        for payload in rows:
            # One throwaway class per row; its ``json`` attribute is the payload.
            row_type = type("R", (), {"json": payload})
            wrapped.append(row_type())
        self._rows = wrapped

    def __iter__(self):
        row_iterator = iter(self._rows)
        return row_iterator
23
+
24
+
25
class _StubTask:
    """Export-task stand-in: nothing to wait for, streams the rows it was given."""

    def __init__(self, rows: list[dict]) -> None:
        self._rows = rows

    def wait_till_done(self) -> None:
        # Nothing to wait on; the rows are already in memory.
        return None

    def get_buffered_stream(self):
        stream = _StubStream(self._rows)
        return stream
34
+
35
+
36
class _StubProject:
    """Project stand-in that remembers the filters passed to ``export``."""

    def __init__(self, rows: list[dict]) -> None:
        self.last_filters: dict | None = None
        self._rows = rows

    def export(self, params: dict, filters: dict | None) -> _StubTask:
        # Record the filters so tests can assert on what was forwarded;
        # ``params`` is accepted but not used.
        self.last_filters = filters
        task = _StubTask(self._rows)
        return task
44
+
45
+
46
class _StubClient:
    """Client stand-in that returns one shared stub project for any project id."""

    def __init__(self, rows: list[dict]) -> None:
        project = _StubProject(rows)
        self._project = project

    def get_project(self, project_id: str) -> _StubProject:
        # ``project_id`` is ignored: every lookup yields the same stub.
        return self._project
52
+
53
+
54
def test_export_streams_via_injected_client() -> None:
    """``export`` yields the stub's rows unchanged and forwards the status filter."""
    expected = read_export_file(FIXTURES / "species_export.ndjson")
    stub = _StubClient(expected)
    streamed = list(export("proj_x", status="Done", client=stub))
    assert streamed == expected
    forwarded = stub._project.last_filters
    assert forwarded == {"workflow_status": "Done"}
60
+
61
+
62
def test_export_since_filters_on_latest_label() -> None:
    """``since`` keeps only data rows whose latest label clears the date floor."""
    fixture_rows = read_export_file(FIXTURES / "species_export.ndjson")
    stub = _StubClient(fixture_rows)
    kept = list(export("proj_x", since="2026-06-02", client=stub))
    surviving_keys = [row["data_row"]["global_key"] for row in kept]
    # Only photo_b (latest label 2026-06-03) clears the 2026-06-02 floor.
    assert surviving_keys == ["photo_b.JPG"]
68
+
69
+
70
def test_cli_pull_offline_generic(tmp_path: Path) -> None:
    """Offline ``pull`` without ``--schema`` succeeds and writes the long-format header."""
    destination = tmp_path / "labels.csv"
    export_file = FIXTURES / "boxes_masks_export.ndjson"
    argv = ["pull", "proj_y", "--from-export", str(export_file), "-o", str(destination)]
    result = runner.invoke(app, argv)
    assert result.exit_code == 0, result.output
    # The console output carries a version banner and a per-kind summary line.
    for fragment in ("labelpull v", "kinds:"):
        assert fragment in result.output
    with destination.open(newline="") as handle:
        header = next(csv.reader(handle))
    assert header[0] == "global_key"
    assert "feature_kind" in header
89
+
90
+
91
def test_cli_pull_offline_species(tmp_path: Path) -> None:
    """Offline ``pull --schema species`` writes the wide species CSV."""
    destination = tmp_path / "taxa.csv"
    export_file = FIXTURES / "species_export.ndjson"
    argv = ["pull", "proj_x", "--schema", "species", "--from-export", str(export_file)]
    argv += ["-o", str(destination)]
    result = runner.invoke(app, argv)
    assert result.exit_code == 0, result.output
    with destination.open(newline="") as handle:
        table = list(csv.reader(handle))
    header, records = table[0], table[1:]
    assert header == ["global_key", "taxon", "organs", "labeled_by", "workflow_status"]
    # The reviewer's corrected taxon must appear in the taxon column.
    assert any(record[1] == "Apeiba tibourbou" for record in records)
111
+
112
+
113
def test_cli_unknown_schema_errors() -> None:
    """An unrecognised ``--schema`` value makes ``pull`` exit non-zero."""
    export_file = str(FIXTURES / "species_export.ndjson")
    argv = ["pull", "p", "--schema", "nope", "--from-export", export_file]
    result = runner.invoke(app, argv)
    assert result.exit_code != 0
@@ -0,0 +1,119 @@
1
+ """Engine tests: latest-label selection, status normalization, generic flatten."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ from labelpull.core import (
10
+ FeatureRow,
11
+ flatten,
12
+ read_export_file,
13
+ summarize,
14
+ )
15
+
16
+ FIXTURES = Path(__file__).parent / "fixtures"
17
+
18
+
19
@pytest.fixture
def species_rows() -> list[dict]:
    """Rows read from the ``species_export.ndjson`` fixture file."""
    fixture_path = FIXTURES / "species_export.ndjson"
    return read_export_file(fixture_path)
22
+
23
+
24
@pytest.fixture
def boxes_rows() -> list[dict]:
    """Rows read from the ``boxes_masks_export.ndjson`` fixture file."""
    fixture_path = FIXTURES / "boxes_masks_export.ndjson"
    return read_export_file(fixture_path)
27
+
28
+
29
def _features(rows: list[dict], project_id: str) -> list[FeatureRow]:
    """Flatten every export row for *project_id* into one list of feature rows."""
    collected: list[FeatureRow] = []
    for row in rows:
        collected.extend(flatten(row, project_id))
    return collected
31
+
32
+
33
def test_read_export_file_handles_ndjson_and_json_array(tmp_path: Path) -> None:
    """NDJSON, a JSON array and an empty file all parse to the expected row list."""
    two_rows = [{"x": 1}, {"x": 2}]
    cases = (
        ("a.ndjson", '{"x": 1}\n{"x": 2}\n', two_rows),
        ("b.json", '[{"x": 1}, {"x": 2}]', two_rows),
        ("c.json", "", []),
    )
    for filename, text, expected in cases:
        target = tmp_path / filename
        target.write_text(text)
        assert read_export_file(target) == expected
43
+
44
+
45
def test_latest_label_wins_over_array_order(species_rows: list[dict]) -> None:
    """The most recently created label is flattened, not whichever comes first."""
    # dr_2 has the annotator's "Apeiba membranacea" first, reviewer's correction second.
    dr_2 = species_rows[1]
    feats = flatten(dr_2, "proj_x")
    taxa = []
    for feat in feats:
        if feat.feature_name == "Taxon":
            taxa.append(feat.value)
    assert taxa == ["Apeiba tibourbou"]  # reviewer's later label, not labels[0]
    # Every emitted feature is attributed to the reviewer.
    for feat in feats:
        assert feat.labeled_by == "reviewer@bci.org"
51
+
52
+
53
def test_workflow_status_falls_back_to_task_queue(species_rows: list[dict]) -> None:
    """``task_queue_name`` supplies the status when ``workflow_status`` is absent."""
    # dr_3 has no workflow_status, only task_queue_name == "Done".
    dr_3 = species_rows[2]
    feats = flatten(dr_3, "proj_x")
    assert feats  # reached + labelled (empty annotations) still yields the sentinel
    statuses = {feat.workflow_status for feat in feats}
    assert statuses == {"Done"}
    kinds = [feat.feature_kind for feat in feats]
    assert kinds == ["label"]  # no classifications/objects
59
+
60
+
61
def test_unlabelled_row_yields_nothing(species_rows: list[dict]) -> None:
    """A data row with an empty ``labels`` list flattens to no features."""
    dr_4 = species_rows[3]  # dr_4 has no labels
    assert flatten(dr_4, "proj_x") == []
63
+
64
+
65
def test_checklist_joined_and_radio_value(species_rows: list[dict]) -> None:
    """Radio answers keep their value; checklist answers are joined with ``;``."""
    by_name = {}
    for feat in flatten(species_rows[0], "proj_x"):
        # Skip the per-label "label" row; only real features are checked.
        if feat.feature_kind != "label":
            by_name[feat.feature_name] = feat
    taxon = by_name["Taxon"]
    assert taxon.value == "Ficus insipida"
    assert taxon.feature_kind == "radio"
    organs = by_name["Organs"]
    assert organs.value == "leaf;flower"
    assert organs.feature_kind == "checklist"
72
+
73
+
74
def test_flatten_objects_and_nested_classifications(boxes_rows: list[dict]) -> None:
    """Objects, top-level classifications and nested classifications all flatten."""
    feats = flatten(boxes_rows[0], "proj_y")
    seen = {(feat.feature_name, feat.feature_kind) for feat in feats}
    wanted_pairs = (
        ("Caption", "text"),
        ("Plant", "bounding_box"),
        ("Canopy", "polygon"),
    )
    for pair in wanted_pairs:
        assert pair in seen
    # Nested species radio is linked to its parent box feature_id.
    nested = next(feat for feat in feats if feat.feature_name == "Species")
    assert nested.feature_kind == "radio"
    assert nested.value == "Cecropia"
    assert nested.parent_feature_id == "f_box1"
85
+
86
+
87
def test_flatten_mask_and_point(boxes_rows: list[dict]) -> None:
    """A mask flattens to its URL and a point to its JSON-encoded coordinates."""
    by_name = {}
    for feat in flatten(boxes_rows[1], "proj_y"):
        # Skip the per-label "label" row; only real features are checked.
        if feat.feature_kind != "label":
            by_name[feat.feature_name] = feat
    leaf = by_name["Leaf"]
    assert leaf.feature_kind == "mask"
    assert leaf.value == "https://api.labelbox.com/masks/abc.png"
    tip = by_name["Tip"]
    assert tip.feature_kind == "point"
    assert tip.value == '{"x": 7, "y": 8}'
94
+
95
+
96
def test_flatten_single_project_inferred_when_id_omitted(boxes_rows: list[dict]) -> None:
    """Omitting the project id gives the same result as naming ``proj_y``."""
    row = boxes_rows[0]
    inferred = flatten(row)
    explicit = flatten(row, "proj_y")
    assert inferred == explicit
98
+
99
+
100
def test_flatten_ambiguous_multi_project_returns_empty() -> None:
    """With two projects and no project id given, ``flatten`` returns nothing."""
    projects = {
        project_id: {"labels": [{"label_details": {}, "annotations": {}}]}
        for project_id in ("a", "b")
    }
    dr = {"data_row": {"id": "d", "global_key": "g"}, "projects": projects}
    assert flatten(dr) == []  # two projects, none named -> no silent mixing
109
+
110
+
111
def test_summarize_counts(species_rows: list[dict]) -> None:
    """``summarize`` reports row, status and feature-kind counts for the fixture."""
    flattened = _features(species_rows, "proj_x")
    summary = summarize(species_rows, flattened)
    assert summary.n_data_rows == 4
    assert summary.n_labelled == 3  # dr_1, dr_2, dr_3 (dr_4 unlabelled)
    assert summary.n_reached_unlabelled == 1
    assert summary.statuses == {"InReview": 1, "Done": 2}
    radio_count = summary.feature_kinds["radio"]
    assert radio_count == 2
    # The newest label in the fixture is the reviewer's on dr_2.
    assert summary.latest_created_at == "2026-06-03T14:00:00Z"