PyPI - labelpull - Versions diffs - 0.1.0__tar.gz - Mend

labelpull 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

labelpull-0.1.0/.github/workflows/ci.yml +64 -0
labelpull-0.1.0/.github/workflows/release.yml +39 -0
labelpull-0.1.0/.gitignore +10 -0
labelpull-0.1.0/PKG-INFO +69 -0
labelpull-0.1.0/README.md +52 -0
labelpull-0.1.0/pyproject.toml +43 -0
labelpull-0.1.0/src/labelpull/__init__.py +44 -0
labelpull-0.1.0/src/labelpull/adapters.py +110 -0
labelpull-0.1.0/src/labelpull/cli.py +101 -0
labelpull-0.1.0/src/labelpull/core.py +285 -0
labelpull-0.1.0/src/labelpull/py.typed +0 -0
labelpull-0.1.0/tests/fixtures/boxes_masks_export.ndjson +2 -0
labelpull-0.1.0/tests/fixtures/species_export.ndjson +4 -0
labelpull-0.1.0/tests/test_adapters.py +66 -0
labelpull-0.1.0/tests/test_cli.py +118 -0
labelpull-0.1.0/tests/test_core.py +119 -0

labelpull-0.1.0/.github/workflows/ci.yml ADDED Viewed

@@ -0,0 +1,64 @@
+name: CI
+on:
+  push:
+    branches: [main]
+  pull_request:
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+          cache-dependency-glob: "**/pyproject.toml"
+      - name: Set up Python ${{ matrix.python-version }}
+        run: uv python install ${{ matrix.python-version }}
+      - name: Install
+        run: |
+          uv venv --python ${{ matrix.python-version }} .venv
+          uv pip install -e ".[dev]"
+      - name: Lint
+        run: |
+          source .venv/bin/activate
+          ruff check src tests
+          ruff format --check src tests
+      - name: Type-check
+        run: |
+          source .venv/bin/activate
+          mypy --strict --no-warn-unused-ignores src/labelpull
+      - name: Test
+        run: |
+          source .venv/bin/activate
+          pytest --cov=labelpull --cov-report=term-missing -q
+  build:
+    runs-on: ubuntu-latest
+    needs: test
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v3
+      - name: Build sdist + wheel
+        run: uv build
+      - name: Check metadata renders on PyPI
+        run: uvx twine check dist/*
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist
+          path: dist/

labelpull-0.1.0/.github/workflows/release.yml ADDED Viewed

@@ -0,0 +1,39 @@
+name: Release
+# Publish to PyPI when a version tag is pushed (e.g. `git tag v0.1.0 && git push --tags`).
+# Uses PyPI trusted publishing (OIDC) — no API token secret required. To enable:
+#   1. Create the project on PyPI and add this repo as a trusted publisher
+#      (workflow `release.yml`, environment `pypi`).
+#   2. Create a GitHub environment named `pypi` (optionally with required reviewers).
+on:
+  push:
+    tags: ["v*"]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v3
+      - name: Build sdist + wheel
+        run: uv build
+      - name: Check metadata renders on PyPI
+        run: uvx twine check dist/*
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist
+          path: dist/
+  publish:
+    needs: build
+    runs-on: ubuntu-latest
+    environment: pypi
+    permissions:
+      id-token: write # OIDC token for PyPI trusted publishing
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          name: dist
+          path: dist/
+      - uses: pypa/gh-action-pypi-publish@release/v1

labelpull-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,10 @@
+__pycache__/
+*.pyc
+.venv/
+.pytest_cache/
+.ruff_cache/
+.mypy_cache/
+dist/
+*.egg-info/
+.coverage
+.claude-flow/

labelpull-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,69 @@
+Metadata-Version: 2.4
+Name: labelpull
+Version: 0.1.0
+Summary: Pull the latest Labelbox annotations into a tidy, ontology-agnostic table.
+Author-email: Wietze Suijker <wietze.suijker@gmail.com>
+License-Expression: MIT
+Requires-Python: >=3.10
+Requires-Dist: typer>=0.9
+Provides-Extra: dev
+Requires-Dist: mypy>=1.8; extra == 'dev'
+Requires-Dist: pytest-cov>=4.1; extra == 'dev'
+Requires-Dist: pytest>=7.4; extra == 'dev'
+Requires-Dist: ruff>=0.4; extra == 'dev'
+Provides-Extra: live
+Requires-Dist: labelbox>=7.0; extra == 'live'
+Description-Content-Type: text/markdown
+# labelpull
+Pull the latest Labelbox annotations into a tidy, ontology-agnostic table.
+The Labelbox SDK already exports a project's labels and streams them. What it
+doesn't give you is a *tabular* view of that deeply nested JSON, the correctness
+logic to pick the right label when a row was reviewed, or a workflow status that
+is always populated. `labelpull` is exactly that thin layer on top of the SDK.
+## Install
+```bash
+pip install labelpull            # offline parsing + CLI
+pip install 'labelpull[live]'    # + the Labelbox SDK for live pulls
+```
+## CLI
+```bash
+export LABELBOX_API_KEY=...
+labelpull pull <PROJECT_ID> -o labels.csv               # generic long CSV (any ontology)
+labelpull pull <PROJECT_ID> --status Done               # only verified rows
+labelpull pull <PROJECT_ID> --since 2026-06-01          # only the latest labels
+labelpull pull <PROJECT_ID> --from-export export.ndjson # offline, no API key
+labelpull pull <PROJECT_ID> --schema species -o taxa.csv # speciesfirst Taxon/Organs wide CSV
+```
+`--schema generic` (default) writes one row per feature — every classification
+and object, any ontology:
+```
+global_key,data_row_id,feature_kind,feature_name,value,workflow_status,labeled_by,created_at,parent_feature_id
+```
+## Library
+```python
+import labelpull
+rows = list(labelpull.export("proj_id", status="Done"))   # or read_export_file("export.ndjson")
+features = [f for r in rows for f in labelpull.flatten(r, "proj_id")]
+labelpull.write_csv("labels.csv", labelpull.GenericAdapter(), features)
+print(labelpull.summarize(rows, features))
+```
+`flatten()` handles radio / checklist / text classifications and bbox / polygon /
+line / point / mask objects (with nested classifications linked to their parent),
+and always selects the most recently created label so a QC-reviewed row reports
+the reviewer's answer, not the annotator's.
+Write your own `Adapter` to collapse features into a project-specific wide table;
+`SpeciesAdapter` is the reference implementation.

labelpull-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,52 @@
+# labelpull
+Pull the latest Labelbox annotations into a tidy, ontology-agnostic table.
+The Labelbox SDK already exports a project's labels and streams them. What it
+doesn't give you is a *tabular* view of that deeply nested JSON, the correctness
+logic to pick the right label when a row was reviewed, or a workflow status that
+is always populated. `labelpull` is exactly that thin layer on top of the SDK.
+## Install
+```bash
+pip install labelpull            # offline parsing + CLI
+pip install 'labelpull[live]'    # + the Labelbox SDK for live pulls
+```
+## CLI
+```bash
+export LABELBOX_API_KEY=...
+labelpull pull <PROJECT_ID> -o labels.csv               # generic long CSV (any ontology)
+labelpull pull <PROJECT_ID> --status Done               # only verified rows
+labelpull pull <PROJECT_ID> --since 2026-06-01          # only the latest labels
+labelpull pull <PROJECT_ID> --from-export export.ndjson # offline, no API key
+labelpull pull <PROJECT_ID> --schema species -o taxa.csv # speciesfirst Taxon/Organs wide CSV
+```
+`--schema generic` (default) writes one row per feature — every classification
+and object, any ontology:
+```
+global_key,data_row_id,feature_kind,feature_name,value,workflow_status,labeled_by,created_at,parent_feature_id
+```
+## Library
+```python
+import labelpull
+rows = list(labelpull.export("proj_id", status="Done"))   # or read_export_file("export.ndjson")
+features = [f for r in rows for f in labelpull.flatten(r, "proj_id")]
+labelpull.write_csv("labels.csv", labelpull.GenericAdapter(), features)
+print(labelpull.summarize(rows, features))
+```
+`flatten()` handles radio / checklist / text classifications and bbox / polygon /
+line / point / mask objects (with nested classifications linked to their parent),
+and always selects the most recently created label so a QC-reviewed row reports
+the reviewer's answer, not the annotator's.
+Write your own `Adapter` to collapse features into a project-specific wide table;
+`SpeciesAdapter` is the reference implementation.

labelpull-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,43 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "labelpull"
+version = "0.1.0"
+description = "Pull the latest Labelbox annotations into a tidy, ontology-agnostic table."
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+authors = [{ name = "Wietze Suijker", email = "wietze.suijker@gmail.com" }]
+dependencies = ["typer>=0.9"]
+[project.optional-dependencies]
+live = ["labelbox>=7.0"]
+dev = ["pytest>=7.4", "pytest-cov>=4.1", "ruff>=0.4", "mypy>=1.8"]
+[project.scripts]
+labelpull = "labelpull.cli:app"
+[tool.hatch.build.targets.wheel]
+packages = ["src/labelpull"]
+[tool.ruff]
+line-length = 100
+target-version = "py310"
+[tool.ruff.lint]
+select = ["E", "F", "I", "B", "UP", "SIM", "PL"]
+[tool.ruff.lint.per-file-ignores]
+# Typer expresses CLI args as call-defaults (B008) and wide signatures (PLR0913).
+"src/labelpull/cli.py" = ["B008", "PLR0913"]
+# Tests assert against literal expected values (PLR2004) and import lazily (PLC0415).
+"tests/*" = ["PLR2004", "PLC0415", "E501"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+[[tool.mypy.overrides]]
+module = "labelbox"
+ignore_missing_imports = true

labelpull-0.1.0/src/labelpull/__init__.py ADDED Viewed

@@ -0,0 +1,44 @@
+"""labelpull: pull the latest Labelbox annotations into a tidy table.
+The Labelbox SDK exports a project's labels as nested, ontology-shaped JSON.
+labelpull is the thin layer the SDK lacks: a generic flattener
+(:func:`~labelpull.core.flatten`) plus the correctness logic (latest-label
+selection, status normalization) and a one-command CLI on top.
+"""
+from __future__ import annotations
+from labelpull.adapters import (
+    ADAPTERS,
+    Adapter,
+    GenericAdapter,
+    SpeciesAdapter,
+    write_csv,
+)
+from labelpull.core import (
+    WORKFLOW_STATUSES,
+    FeatureRow,
+    Summary,
+    export,
+    flatten,
+    read_export_file,
+    summarize,
+)
+# Runtime version string; pyproject.toml declares the same value statically,
+# so the two must be bumped together.
+__version__ = "0.1.0"
+# Public API re-exported from the adapters and core submodules.
+__all__ = [
+    "ADAPTERS",
+    "WORKFLOW_STATUSES",
+    "Adapter",
+    "FeatureRow",
+    "GenericAdapter",
+    "SpeciesAdapter",
+    "Summary",
+    "__version__",
+    "export",
+    "flatten",
+    "read_export_file",
+    "summarize",
+    "write_csv",
+]

labelpull-0.1.0/src/labelpull/adapters.py ADDED Viewed

@@ -0,0 +1,110 @@
+"""Adapters: collapse ontology-agnostic :class:`FeatureRow` rows into a shape.
+The generic path writes ``FeatureRow`` rows straight to a long-format CSV that
+any project can read. An adapter narrows that to a project-specific wide record.
+:class:`SpeciesAdapter` is the reference implementation, reproducing
+speciesfirst's ``global_key,taxon,organs,labeled_by,workflow_status`` pull CSV
+from the generic rows, so the engine has exactly one parser.
+"""
+from __future__ import annotations
+import csv
+from collections import OrderedDict
+from collections.abc import Iterable, Sequence
+from pathlib import Path
+from typing import Protocol, runtime_checkable
+from labelpull.core import FeatureRow
+@runtime_checkable
+class Adapter(Protocol):
+    """Structural interface for turning flattened features into CSV rows.
+    An implementation supplies ``columns`` (the header cells) and ``rows``
+    (one sequence of string cells per output line); :func:`write_csv`
+    writes the header first and then every row.
+    """
+    # Header cells, written once before any data row.
+    columns: Sequence[str]
+    def rows(self, features: Iterable[FeatureRow]) -> Iterable[Sequence[str]]: ...
+class GenericAdapter:
+    """One CSV row per feature: the ontology-agnostic long format."""
+    columns: Sequence[str] = (
+        "global_key",
+        "data_row_id",
+        "feature_kind",
+        "feature_name",
+        "value",
+        "workflow_status",
+        "labeled_by",
+        "created_at",
+        "parent_feature_id",
+    )
+    def rows(self, features: Iterable[FeatureRow]) -> Iterable[Sequence[str]]:
+        for f in features:
+            yield (
+                f.global_key,
+                f.data_row_id,
+                f.feature_kind,
+                f.feature_name,
+                f.value,
+                f.workflow_status or "",
+                f.labeled_by or "",
+                f.created_at or "",
+                f.parent_feature_id,
+            )
+class SpeciesAdapter:
+    """One row per ``global_key``: reproduces speciesfirst's pull CSV.
+    ``taxon`` is the ``Taxon`` single-select radio; ``organs`` is the ``Organs``
+    checklist (``;``-joined). A reached-and-labelled row with neither still
+    appears (seeded by the ``label`` sentinel), matching speciesfirst's "reached
+    but unlabelled yields ``taxon=''``" behaviour. Insertion order follows the
+    export stream.
+    """
+    columns: Sequence[str] = ("global_key", "taxon", "organs", "labeled_by", "workflow_status")
+    taxon_feature = "Taxon"
+    organs_feature = "Organs"
+    def rows(self, features: Iterable[FeatureRow]) -> Iterable[Sequence[str]]:
+        by_key: OrderedDict[str, dict[str, str]] = OrderedDict()
+        for f in features:
+            rec = by_key.setdefault(
+                f.global_key,
+                {"taxon": "", "organs": "", "labeled_by": "", "workflow_status": ""},
+            )
+            if f.labeled_by:
+                rec["labeled_by"] = f.labeled_by
+            if f.workflow_status:
+                rec["workflow_status"] = f.workflow_status
+            if f.feature_kind == "radio" and f.feature_name == self.taxon_feature and f.value:
+                rec["taxon"] = f.value
+            elif f.feature_kind == "checklist" and f.feature_name == self.organs_feature:
+                rec["organs"] = f.value
+        for global_key, rec in by_key.items():
+            yield (
+                global_key,
+                rec["taxon"],
+                rec["organs"],
+                rec["labeled_by"],
+                rec["workflow_status"],
+            )
+# Registry of schema name -> adapter class; the CLI looks a name up here and
+# instantiates the class with no arguments.
+ADAPTERS: dict[str, type] = {"generic": GenericAdapter, "species": SpeciesAdapter}
+def write_csv(path: str | Path, adapter: Adapter, features: Iterable[FeatureRow]) -> Path:
+    """Write ``features`` through ``adapter`` to ``path`` (parents created)."""
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(adapter.columns)
+        writer.writerows(adapter.rows(features))
+    return path

labelpull-0.1.0/src/labelpull/cli.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""``labelpull`` CLI: pull the latest Labelbox annotations to a tidy CSV.
+labelpull pull PROJECT_ID -o labels.csv
+labelpull pull PROJECT_ID --status Done --since 2026-06-01
+labelpull pull PROJECT_ID --schema species -o taxa.csv
+labelpull pull PROJECT_ID --from-export export.ndjson   # offline, no API key
+"""
+from __future__ import annotations
+from pathlib import Path
+import typer
+from labelpull import __version__
+from labelpull.adapters import ADAPTERS, write_csv
+from labelpull.core import (
+    FeatureRow,
+    JsonDict,
+    _created_at,
+    _select_project,
+    flatten,
+    read_export_file,
+    summarize,
+)
+from labelpull.core import export as live_export
+# Root Typer application (constructed with add_completion=False).
+app = typer.Typer(add_completion=False, help="Pull the latest Labelbox annotations to CSV.")
+# NOTE(review): presumably this no-op callback is registered so that ``pull``
+# stays an explicit subcommand even though it is the only command; confirm
+# against the Typer documentation.
+@app.callback()
+def _main() -> None:
+    """labelpull: pull the latest Labelbox annotations into a tidy table."""
+@app.command()
+def pull(
+    project_id: str = typer.Argument(..., help="Labelbox project id to export from."),
+    out: Path = typer.Option(
+        Path("pulled_labels.csv"), "--out", "-o", help="Where to write the CSV."
+    ),
+    schema: str = typer.Option(
+        "generic",
+        help="generic = one row per feature (any ontology); "
+        "species = speciesfirst Taxon/Organs wide CSV.",
+    ),
+    status: str | None = typer.Option(
+        None, help="Filter by task-queue stage: ToLabel | InReview | InRework | Done."
+    ),
+    since: str | None = typer.Option(
+        None, help="Keep only rows whose newest label was created on/after this ISO date/time."
+    ),
+    from_export: Path | None = typer.Option(
+        None,
+        exists=True,
+        dir_okay=False,
+        help="Flatten a saved export (JSON/NDJSON) offline instead of the live API.",
+    ),
+    api_key: str | None = typer.Option(None, help="Labelbox API key (else LABELBOX_API_KEY)."),
+) -> None:
+    """Export the latest annotations and flatten them to CSV, with a summary."""
+    if schema not in ADAPTERS:
+        raise typer.BadParameter(f"unknown schema {schema!r}; choose from {sorted(ADAPTERS)}")
+    adapter = ADAPTERS[schema]()
+    typer.echo(f"labelpull v{__version__}")
+    if from_export is not None:
+        rows = read_export_file(from_export)
+        if since is not None:
+            rows = [r for r in rows if _row_since(r, project_id, since)]
+        typer.echo(f"  read {len(rows)} rows from {from_export}")
+    else:
+        rows = list(live_export(project_id, status=status, since=since, api_key=api_key))
+        typer.echo(f"  exported {len(rows)} rows from project {project_id}")
+    features = [f for r in rows for f in flatten(r, project_id)]
+    _print_summary(rows, features)
+    write_csv(out, adapter, features)
+    typer.echo(f"wrote {schema} CSV: {out}")
+def _row_since(dr: JsonDict, project_id: str, since: str) -> bool:
+    return _created_at(_select_project(dr, project_id)) >= since
+def _print_summary(rows: list[JsonDict], features: list[FeatureRow]) -> None:
+    s = summarize(rows, features)
+    typer.echo(
+        f"  {s.n_labelled} labelled / {s.n_data_rows} rows "
+        f"({s.n_reached_unlabelled} reached unlabelled)"
+    )
+    if s.statuses:
+        typer.echo("  status: " + ", ".join(f"{k}={v}" for k, v in sorted(s.statuses.items())))
+    if s.feature_kinds:
+        typer.echo("  kinds:  " + ", ".join(f"{k}={v}" for k, v in sorted(s.feature_kinds.items())))
+    if s.latest_created_at:
+        typer.echo(f"  latest label: {s.latest_created_at}")
+# Run the Typer app when this module is executed as a script.
+if __name__ == "__main__":  # pragma: no cover
+    app()

labelpull-0.1.0/src/labelpull/core.py ADDED Viewed

@@ -0,0 +1,285 @@
+"""Ontology-agnostic Labelbox export + flatten.
+The Labelbox SDK already exports a project's labels and streams them as deeply
+nested, ontology-shaped JSON. What it does *not* give you is a tabular view, the
+correctness logic to pick the right label when a row was reviewed, or a workflow
+status that is always populated. This module is exactly that thin layer:
+* :func:`export` wraps ``project.export(...)`` + ``wait_till_done()`` +
+  ``get_buffered_stream()`` (SDK lazy-imported, so it is optional) and adds a
+  ``since`` filter for "only the latest annotations".
+* :func:`flatten` turns one export row into :class:`FeatureRow` long-format rows,
+  covering *every* feature kind (classifications AND objects) without assuming a
+  particular ontology. It encodes the two traps a hand-written parser gets wrong:
+  selecting the most-recently-created label (a QC-reviewed row carries both the
+  annotator's and the reviewer's label) and normalizing the workflow status.
+* :func:`read_export_file` parses a saved export (UI download or a prior pull) so
+  the same flattener runs offline, no API key required.
+"""
+from __future__ import annotations
+import json
+import os
+from collections.abc import Iterable, Iterator
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, cast
+# One export row (and its nested blocks) is arbitrary JSON; alias it for brevity.
+JsonDict = dict[str, Any]
+# The task-queue stages ``project.export(filters={"workflow_status": ...})`` accepts.
+WORKFLOW_STATUSES = ("ToLabel", "InReview", "InRework", "Done")
+# Geometry keys a localized object may carry in the v7 export, in probe order:
+# ``_object_geometry`` reports the first of these keys whose value is not ``None``.
+_GEOMETRY_KINDS = ("bounding_box", "polygon", "line", "point", "mask")
+@dataclass(frozen=True)
+class FeatureRow:
+    """One ``(label, feature)`` pair from an export row, ontology-agnostic.
+    A classification answer or a localized object. An object's nested
+    classifications become their own rows, linked to the object via
+    :attr:`parent_feature_id`. Each labelled data row also yields one
+    ``feature_kind="label"`` sentinel row (no feature, ``value=""``) so that a
+    reached-and-labelled row is always represented even when empty.
+    """
+    # Global key of the data row; flatten() emits nothing for a row without one.
+    global_key: str
+    # Data-row id; flatten() falls back to the global key when the id is missing.
+    data_row_id: str
+    # one of: label, radio, checklist, text, bounding_box, polygon,
+    # line, point, mask, relationship, unknown
+    feature_kind: str
+    # Feature name from the export; "" when the export gives none (and for the sentinel).
+    feature_name: str
+    value: str  # answer value(s) / compact geometry; "" when none
+    # Status of the data row in the project, as resolved by _workflow_status().
+    workflow_status: str | None
+    # ``created_by`` / ``created_at`` of the selected (latest) label's details.
+    labeled_by: str | None
+    created_at: str | None
+    parent_feature_id: str  # "" for top-level features
+def export(
+    project_id: str,
+    *,
+    status: str | None = None,
+    since: str | None = None,
+    api_key: str | None = None,
+    client: Any | None = None,
+) -> Iterator[JsonDict]:
+    """Stream export rows (one dict per data row) for ``project_id``.
+    ``status`` filters by task-queue stage (see :data:`WORKFLOW_STATUSES`).
+    ``since`` keeps only rows whose newest label was created on/after an ISO
+    date/datetime string (lexicographic compare on the ISO timestamp). Pass
+    ``client`` to inject a stub; otherwise the ``labelbox`` SDK is imported
+    lazily and a client is built from ``api_key`` or ``LABELBOX_API_KEY``.
+    """
+    cl = client if client is not None else _make_client(api_key)
+    project = cl.get_project(project_id)
+    filters = {"workflow_status": status} if status else None
+    task = project.export(
+        params={"data_row_details": True, "label_details": True, "project_details": True},
+        filters=filters,
+    )
+    task.wait_till_done()
+    for row in task.get_buffered_stream():
+        dr = row.json
+        if since is None or _created_at(_select_project(dr, project_id)) >= since:
+            yield dr
+def read_export_file(path: str | Path) -> list[JsonDict]:
+    """Load a saved export (JSON array or NDJSON) for offline flattening."""
+    text = Path(path).read_text().strip()
+    if not text:
+        return []
+    try:
+        loaded = json.loads(text)
+        return loaded if isinstance(loaded, list) else [loaded]
+    except json.JSONDecodeError:
+        return [json.loads(line) for line in text.splitlines() if line.strip()]
+def flatten(dr: JsonDict, project_id: str | None = None) -> list[FeatureRow]:
+    """Flatten one export row into :class:`FeatureRow` rows (every feature).
+    ``project_id`` selects which project's labels to read; ``None`` uses the only
+    project present (the common single-project export) and returns nothing if the
+    row is ambiguous (multiple projects) so a caller never silently mixes them.
+    An unreached or unlabelled row yields ``[]``.
+    """
+    data_row = dr.get("data_row") or {}
+    global_key = data_row.get("global_key") or ""
+    data_row_id = data_row.get("id") or global_key
+    proj = _select_project(dr, project_id)
+    label = _latest_label(proj)
+    if not global_key or label is None:
+        return []
+    status = _workflow_status(proj)
+    details = label.get("label_details") or {}
+    labeled_by = details.get("created_by")
+    created_at = details.get("created_at")
+    ann = label.get("annotations") or {}
+    rows: list[FeatureRow] = []
+    def emit(kind: str, name: str | None, value: str, parent: str = "") -> None:
+        rows.append(
+            FeatureRow(
+                global_key,
+                data_row_id,
+                kind,
+                name or "",
+                value,
+                status,
+                labeled_by,
+                created_at,
+                parent,
+            )
+        )
+    # Sentinel: this row was reached and labelled (carries who/when even if empty).
+    emit("label", "", "")
+    for cls in ann.get("classifications") or []:
+        kind, value = _classification_value(cls)
+        emit(kind, cls.get("name"), value)
+    for obj in ann.get("objects") or []:
+        kind, value = _object_geometry(obj)
+        feature_id = obj.get("feature_id") or obj.get("feature_schema_id") or ""
+        emit(kind, obj.get("name"), value)
+        for cls in obj.get("classifications") or []:
+            ckind, cvalue = _classification_value(cls)
+            emit(ckind, cls.get("name"), cvalue, parent=feature_id)
+    for rel in ann.get("relationships") or []:
+        value = json.dumps(rel.get("relationship") or {}, sort_keys=True)
+        emit("relationship", rel.get("name"), value)
+    return rows
+@dataclass(frozen=True)
+class Summary:
+    """Triage view of a pull: how much came back, of what kind, how fresh."""
+    # Number of export rows passed to summarize().
+    n_data_rows: int
+    # Number of distinct global keys that produced at least one feature row.
+    n_labelled: int
+    # n_data_rows - n_labelled, floored at 0.
+    n_reached_unlabelled: int
+    # Occurrences per feature kind, excluding the "label" sentinel rows.
+    feature_kinds: dict[str, int]
+    # Occurrences per non-empty feature name, excluding the sentinel rows.
+    feature_names: dict[str, int]
+    # Occurrences per workflow status, counted once per sentinel row.
+    statuses: dict[str, int]
+    # Largest created_at string seen on a sentinel row; None when there is none.
+    latest_created_at: str | None
+def summarize(rows: Iterable[JsonDict], features: Iterable[FeatureRow]) -> Summary:
+    """Count data rows, labelled rows, and per-kind/name/status breakdowns."""
+    rows = list(rows)
+    features = list(features)
+    labelled_keys = {f.global_key for f in features}
+    kinds: dict[str, int] = {}
+    names: dict[str, int] = {}
+    statuses: dict[str, int] = {}
+    latest: str | None = None
+    for f in features:
+        if f.feature_kind == "label":
+            if f.workflow_status:
+                statuses[f.workflow_status] = statuses.get(f.workflow_status, 0) + 1
+            if f.created_at and (latest is None or f.created_at > latest):
+                latest = f.created_at
+            continue
+        kinds[f.feature_kind] = kinds.get(f.feature_kind, 0) + 1
+        if f.feature_name:
+            names[f.feature_name] = names.get(f.feature_name, 0) + 1
+    n_data_rows = len(rows)
+    n_labelled = len(labelled_keys)
+    return Summary(
+        n_data_rows=n_data_rows,
+        n_labelled=n_labelled,
+        n_reached_unlabelled=max(n_data_rows - n_labelled, 0),
+        feature_kinds=kinds,
+        feature_names=names,
+        statuses=statuses,
+        latest_created_at=latest,
+    )
+# --- internals -------------------------------------------------------------
+def _make_client(api_key: str | None) -> Any:
+    try:
+        import labelbox as lb  # noqa: PLC0415 (optional dep, imported only for live pulls)
+    except ImportError as exc:  # pragma: no cover - exercised only without the SDK
+        raise RuntimeError(
+            "a live pull needs the Labelbox SDK: pip install 'labelpull[live]'"
+        ) from exc
+    key = api_key or os.environ.get("LABELBOX_API_KEY")
+    if not key:
+        raise RuntimeError(
+            "no Labelbox API key: pass api_key=... or set LABELBOX_API_KEY "
+            "(or use a saved export with read_export_file)"
+        )
+    return lb.Client(api_key=key)
+def _select_project(dr: JsonDict, project_id: str | None) -> JsonDict:
+    projects = dr.get("projects") or {}
+    if project_id is not None:
+        return projects.get(project_id) or {}
+    if len(projects) == 1:
+        return next(iter(projects.values()))
+    return {}  # ambiguous: caller must name the project
+def _latest_label(proj: JsonDict) -> JsonDict | None:
+    # A QC-reviewed row carries the annotator's label *and* the reviewer's; the
+    # verified answer is the most recently created, not labels[0].
+    labels = proj.get("labels") or []
+    if not labels:
+        return None
+    return cast("JsonDict", max(labels, key=_created_at_of_label))
+def _created_at_of_label(label: JsonDict) -> str:
+    return (label.get("label_details") or {}).get("created_at") or ""
+def _created_at(proj: JsonDict) -> str:
+    label = _latest_label(proj)
+    return _created_at_of_label(label) if label else ""
+def _workflow_status(proj: JsonDict) -> str | None:
+    details = proj.get("project_details") or {}
+    status = details.get("workflow_status")
+    if status is None:
+        queue = details.get("task_queue_name") or details.get("task_queue_status")
+        status = "Done" if queue == "Done" else queue
+    return status
+def _classification_value(cls: JsonDict) -> tuple[str, str]:
+    if cls.get("radio_answer"):
+        answer = cls["radio_answer"]
+        return "radio", answer.get("value") or answer.get("name") or ""
+    if cls.get("checklist_answers") is not None:
+        values = [a.get("value") or a.get("name") or "" for a in cls["checklist_answers"]]
+        return "checklist", ";".join(v for v in values if v)
+    if cls.get("text_answer") is not None:
+        return "text", (cls["text_answer"] or {}).get("content") or ""
+    return "unknown", ""
+def _object_geometry(obj: JsonDict) -> tuple[str, str]:
+    for kind in _GEOMETRY_KINDS:
+        geom = obj.get(kind)
+        if geom is None:
+            continue
+        if kind == "mask":
+            return "mask", (geom or {}).get("url") or ""
+        return kind, json.dumps(geom, sort_keys=True)
+    return "unknown", ""

labelpull-0.1.0/src/labelpull/py.typed ADDED Viewed

File without changes

labelpull-0.1.0/tests/fixtures/boxes_masks_export.ndjson ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ {"data_row": {"id": "dr_10", "global_key": "scene_a.png"}, "projects": {"proj_y": {"labels": [{"label_details": {"created_at": "2026-06-05T08:00:00Z", "created_by": "labeler@x.org"}, "annotations": {"classifications": [{"name": "Caption", "text_answer": {"content": "two crowns"}}], "objects": [{"feature_id": "f_box1", "name": "Plant", "bounding_box": {"top": 10, "left": 20, "height": 30, "width": 40}, "classifications": [{"name": "Species", "radio_answer": {"value": "Cecropia"}}]}, {"feature_id": "f_poly1", "name": "Canopy", "polygon": [{"x": 1, "y": 2}, {"x": 3, "y": 4}, {"x": 5, "y": 6}], "classifications": []}]}}], "project_details": {"workflow_status": "Done"}}}}
2	+ {"data_row": {"id": "dr_11", "global_key": "scene_b.png"}, "projects": {"proj_y": {"labels": [{"label_details": {"created_at": "2026-06-06T08:00:00Z", "created_by": "labeler@x.org"}, "annotations": {"classifications": [], "objects": [{"feature_id": "f_mask1", "name": "Leaf", "mask": {"url": "https://api.labelbox.com/masks/abc.png"}, "classifications": []}, {"feature_id": "f_pt1", "name": "Tip", "point": {"x": 7, "y": 8}, "classifications": []}]}}], "project_details": {"workflow_status": "InReview"}}}}

labelpull-0.1.0/tests/fixtures/species_export.ndjson ADDED Viewed

@@ -0,0 +1,4 @@
+{"data_row": {"id": "dr_1", "global_key": "photo_a.JPG"}, "projects": {"proj_x": {"labels": [{"label_details": {"created_at": "2026-06-01T10:00:00Z", "created_by": "ann@bci.org"}, "annotations": {"classifications": [{"name": "Taxon", "radio_answer": {"value": "Ficus insipida", "name": "Ficus insipida"}}, {"name": "Organs", "checklist_answers": [{"value": "leaf"}, {"value": "flower"}]}], "objects": []}}], "project_details": {"workflow_status": "InReview"}}}}
+{"data_row": {"id": "dr_2", "global_key": "photo_b.JPG"}, "projects": {"proj_x": {"labels": [{"label_details": {"created_at": "2026-06-02T09:00:00Z", "created_by": "ann@bci.org"}, "annotations": {"classifications": [{"name": "Taxon", "radio_answer": {"value": "Apeiba membranacea"}}], "objects": []}}, {"label_details": {"created_at": "2026-06-03T14:00:00Z", "created_by": "reviewer@bci.org"}, "annotations": {"classifications": [{"name": "Taxon", "radio_answer": {"value": "Apeiba tibourbou"}}, {"name": "Organs", "checklist_answers": [{"value": "fruit"}]}], "objects": []}}], "project_details": {"workflow_status": "Done"}}}}
+{"data_row": {"id": "dr_3", "global_key": "photo_c.JPG"}, "projects": {"proj_x": {"labels": [{"label_details": {"created_at": "2026-06-01T12:00:00Z", "created_by": "ann@bci.org"}, "annotations": {"classifications": [], "objects": []}}], "project_details": {"task_queue_name": "Done"}}}}
+{"data_row": {"id": "dr_4", "global_key": "photo_d.JPG"}, "projects": {"proj_x": {"labels": [], "project_details": {"workflow_status": "ToLabel"}}}}

labelpull-0.1.0/tests/test_adapters.py ADDED Viewed

@@ -0,0 +1,66 @@
+"""Adapter tests: generic long CSV + the species reference adapter."""
+from __future__ import annotations
+import csv
+from pathlib import Path
+from labelpull.adapters import ADAPTERS, GenericAdapter, SpeciesAdapter, write_csv
+from labelpull.core import flatten, read_export_file
+FIXTURES = Path(__file__).parent / "fixtures"
+def _features(name: str, project_id: str) -> list:
+    rows = read_export_file(FIXTURES / name)
+    return [f for r in rows for f in flatten(r, project_id)]
+def _read(path: Path) -> list[list[str]]:
+    with path.open(newline="") as f:
+        return list(csv.reader(f))
+def test_registry_keys() -> None:
+    assert set(ADAPTERS) == {"generic", "species"}
+def test_generic_long_csv_roundtrips_every_feature(tmp_path: Path) -> None:
+    feats = _features("boxes_masks_export.ndjson", "proj_y")
+    out = write_csv(tmp_path / "g.csv", GenericAdapter(), feats)
+    rows = _read(out)
+    assert rows[0] == list(GenericAdapter.columns)
+    assert len(rows) - 1 == len(feats)  # one CSV row per feature row
+    # The nested species radio carries its parent box id in the long format.
+    species = [r for r in rows if r[3] == "Species"][0]
+    assert species[2] == "radio" and species[4] == "Cecropia" and species[8] == "f_box1"
+def test_species_adapter_wide_csv(tmp_path: Path) -> None:
+    feats = _features("species_export.ndjson", "proj_x")
+    out = write_csv(tmp_path / "s.csv", SpeciesAdapter(), feats)
+    rows = _read(out)
+    assert rows[0] == ["global_key", "taxon", "organs", "labeled_by", "workflow_status"]
+    body = {r[0]: r for r in rows[1:]}
+    # photo_a: InReview, leaf;flower
+    assert body["photo_a.JPG"] == [
+        "photo_a.JPG",
+        "Ficus insipida",
+        "leaf;flower",
+        "ann@bci.org",
+        "InReview",
+    ]
+    # photo_b: reviewer's corrected taxon + fruit, Done
+    assert body["photo_b.JPG"] == [
+        "photo_b.JPG",
+        "Apeiba tibourbou",
+        "fruit",
+        "reviewer@bci.org",
+        "Done",
+    ]
+    # photo_c: reached + labelled but empty -> taxon "", still present
+    assert body["photo_c.JPG"] == ["photo_c.JPG", "", "", "ann@bci.org", "Done"]
+    # photo_d: unlabelled -> absent
+    assert "photo_d.JPG" not in body
+    # insertion order follows the export stream
+    assert [r[0] for r in rows[1:]] == ["photo_a.JPG", "photo_b.JPG", "photo_c.JPG"]

labelpull-0.1.0/tests/test_cli.py ADDED Viewed

@@ -0,0 +1,118 @@
+"""CLI + live-export-with-stub tests (no network, no API key)."""
+from __future__ import annotations
+import csv
+from pathlib import Path
+from typer.testing import CliRunner
+from labelpull.cli import app
+from labelpull.core import export, read_export_file
+FIXTURES = Path(__file__).parent / "fixtures"
+runner = CliRunner()
+class _StubStream:
+    def __init__(self, rows: list[dict]) -> None:
+        self._rows = [type("R", (), {"json": r})() for r in rows]
+    def __iter__(self):
+        return iter(self._rows)
+class _StubTask:
+    def __init__(self, rows: list[dict]) -> None:
+        self._rows = rows
+    def wait_till_done(self) -> None:
+        pass
+    def get_buffered_stream(self):
+        return _StubStream(self._rows)
+class _StubProject:
+    def __init__(self, rows: list[dict]) -> None:
+        self._rows = rows
+        self.last_filters: dict | None = None
+    def export(self, params: dict, filters: dict | None) -> _StubTask:
+        self.last_filters = filters
+        return _StubTask(self._rows)
+class _StubClient:
+    def __init__(self, rows: list[dict]) -> None:
+        self._project = _StubProject(rows)
+    def get_project(self, project_id: str) -> _StubProject:
+        return self._project
+def test_export_streams_via_injected_client() -> None:
+    rows = read_export_file(FIXTURES / "species_export.ndjson")
+    client = _StubClient(rows)
+    out = list(export("proj_x", status="Done", client=client))
+    assert out == rows
+    assert client._project.last_filters == {"workflow_status": "Done"}
+def test_export_since_filters_on_latest_label() -> None:
+    rows = read_export_file(FIXTURES / "species_export.ndjson")
+    out = list(export("proj_x", since="2026-06-02", client=_StubClient(rows)))
+    keys = [r["data_row"]["global_key"] for r in out]
+    # Only photo_b (latest label 2026-06-03) clears the 2026-06-02 floor.
+    assert keys == ["photo_b.JPG"]
+def test_cli_pull_offline_generic(tmp_path: Path) -> None:
+    out = tmp_path / "labels.csv"
+    result = runner.invoke(
+        app,
+        [
+            "pull",
+            "proj_y",
+            "--from-export",
+            str(FIXTURES / "boxes_masks_export.ndjson"),
+            "-o",
+            str(out),
+        ],
+    )
+    assert result.exit_code == 0, result.output
+    assert "labelpull v" in result.output
+    assert "kinds:" in result.output
+    with out.open(newline="") as f:
+        header = next(csv.reader(f))
+    assert header[0] == "global_key" and "feature_kind" in header
+def test_cli_pull_offline_species(tmp_path: Path) -> None:
+    out = tmp_path / "taxa.csv"
+    result = runner.invoke(
+        app,
+        [
+            "pull",
+            "proj_x",
+            "--schema",
+            "species",
+            "--from-export",
+            str(FIXTURES / "species_export.ndjson"),
+            "-o",
+            str(out),
+        ],
+    )
+    assert result.exit_code == 0, result.output
+    with out.open(newline="") as f:
+        rows = list(csv.reader(f))
+    assert rows[0] == ["global_key", "taxon", "organs", "labeled_by", "workflow_status"]
+    assert any(r[1] == "Apeiba tibourbou" for r in rows[1:])
+def test_cli_unknown_schema_errors() -> None:
+    result = runner.invoke(
+        app,
+        ["pull", "p", "--schema", "nope", "--from-export", str(FIXTURES / "species_export.ndjson")],
+    )
+    assert result.exit_code != 0

labelpull-0.1.0/tests/test_core.py ADDED Viewed

@@ -0,0 +1,119 @@
+"""Engine tests: latest-label selection, status normalization, generic flatten."""
+from __future__ import annotations
+from pathlib import Path
+import pytest
+from labelpull.core import (
+    FeatureRow,
+    flatten,
+    read_export_file,
+    summarize,
+)
+FIXTURES = Path(__file__).parent / "fixtures"
+@pytest.fixture
+def species_rows() -> list[dict]:
+    return read_export_file(FIXTURES / "species_export.ndjson")
+@pytest.fixture
+def boxes_rows() -> list[dict]:
+    return read_export_file(FIXTURES / "boxes_masks_export.ndjson")
+def _features(rows: list[dict], project_id: str) -> list[FeatureRow]:
+    return [f for r in rows for f in flatten(r, project_id)]
+def test_read_export_file_handles_ndjson_and_json_array(tmp_path: Path) -> None:
+    ndjson = tmp_path / "a.ndjson"
+    ndjson.write_text('{"x": 1}\n{"x": 2}\n')
+    assert read_export_file(ndjson) == [{"x": 1}, {"x": 2}]
+    arr = tmp_path / "b.json"
+    arr.write_text('[{"x": 1}, {"x": 2}]')
+    assert read_export_file(arr) == [{"x": 1}, {"x": 2}]
+    empty = tmp_path / "c.json"
+    empty.write_text("")
+    assert read_export_file(empty) == []
+def test_latest_label_wins_over_array_order(species_rows: list[dict]) -> None:
+    # dr_2 has the annotator's "Apeiba membranacea" first, reviewer's correction second.
+    feats = flatten(species_rows[1], "proj_x")
+    taxa = [f.value for f in feats if f.feature_name == "Taxon"]
+    assert taxa == ["Apeiba tibourbou"]  # reviewer's later label, not labels[0]
+    assert all(f.labeled_by == "reviewer@bci.org" for f in feats)
+def test_workflow_status_falls_back_to_task_queue(species_rows: list[dict]) -> None:
+    # dr_3 has no workflow_status, only task_queue_name == "Done".
+    feats = flatten(species_rows[2], "proj_x")
+    assert feats  # reached + labelled (empty annotations) still yields the sentinel
+    assert {f.workflow_status for f in feats} == {"Done"}
+    assert [f.feature_kind for f in feats] == ["label"]  # no classifications/objects
+def test_unlabelled_row_yields_nothing(species_rows: list[dict]) -> None:
+    assert flatten(species_rows[3], "proj_x") == []  # dr_4 has no labels
+def test_checklist_joined_and_radio_value(species_rows: list[dict]) -> None:
+    feats = flatten(species_rows[0], "proj_x")
+    by_name = {f.feature_name: f for f in feats if f.feature_kind != "label"}
+    assert by_name["Taxon"].value == "Ficus insipida"
+    assert by_name["Taxon"].feature_kind == "radio"
+    assert by_name["Organs"].value == "leaf;flower"
+    assert by_name["Organs"].feature_kind == "checklist"
+def test_flatten_objects_and_nested_classifications(boxes_rows: list[dict]) -> None:
+    feats = flatten(boxes_rows[0], "proj_y")
+    kinds = {(f.feature_name, f.feature_kind) for f in feats}
+    assert ("Caption", "text") in kinds
+    assert ("Plant", "bounding_box") in kinds
+    assert ("Canopy", "polygon") in kinds
+    # Nested species radio is linked to its parent box feature_id.
+    nested = next(f for f in feats if f.feature_name == "Species")
+    assert nested.feature_kind == "radio"
+    assert nested.value == "Cecropia"
+    assert nested.parent_feature_id == "f_box1"
+def test_flatten_mask_and_point(boxes_rows: list[dict]) -> None:
+    feats = flatten(boxes_rows[1], "proj_y")
+    by_name = {f.feature_name: f for f in feats if f.feature_kind != "label"}
+    assert by_name["Leaf"].feature_kind == "mask"
+    assert by_name["Leaf"].value == "https://api.labelbox.com/masks/abc.png"
+    assert by_name["Tip"].feature_kind == "point"
+    assert by_name["Tip"].value == '{"x": 7, "y": 8}'
+def test_flatten_single_project_inferred_when_id_omitted(boxes_rows: list[dict]) -> None:
+    assert flatten(boxes_rows[0]) == flatten(boxes_rows[0], "proj_y")
+def test_flatten_ambiguous_multi_project_returns_empty() -> None:
+    dr = {
+        "data_row": {"id": "d", "global_key": "g"},
+        "projects": {
+            "a": {"labels": [{"label_details": {}, "annotations": {}}]},
+            "b": {"labels": [{"label_details": {}, "annotations": {}}]},
+        },
+    }
+    assert flatten(dr) == []  # two projects, none named -> no silent mixing
+def test_summarize_counts(species_rows: list[dict]) -> None:
+    feats = _features(species_rows, "proj_x")
+    s = summarize(species_rows, feats)
+    assert s.n_data_rows == 4
+    assert s.n_labelled == 3  # dr_1, dr_2, dr_3 (dr_4 unlabelled)
+    assert s.n_reached_unlabelled == 1
+    assert s.statuses == {"InReview": 1, "Done": 2}
+    assert s.feature_kinds["radio"] == 2
+    assert s.latest_created_at == "2026-06-03T14:00:00Z"