labelpull 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- labelpull/__init__.py +44 -0
- labelpull/adapters.py +110 -0
- labelpull/cli.py +101 -0
- labelpull/core.py +285 -0
- labelpull/py.typed +0 -0
- labelpull-0.1.0.dist-info/METADATA +69 -0
- labelpull-0.1.0.dist-info/RECORD +9 -0
- labelpull-0.1.0.dist-info/WHEEL +4 -0
- labelpull-0.1.0.dist-info/entry_points.txt +2 -0
labelpull/__init__.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""labelpull: pull the latest Labelbox annotations into a tidy table.
|
|
2
|
+
|
|
3
|
+
The Labelbox SDK exports a project's labels as nested, ontology-shaped JSON.
|
|
4
|
+
labelpull is the thin layer the SDK lacks: a generic flattener
|
|
5
|
+
(:func:`~labelpull.core.flatten`) plus the correctness logic (latest-label
|
|
6
|
+
selection, status normalization) and a one-command CLI on top.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from labelpull.adapters import (
|
|
12
|
+
ADAPTERS,
|
|
13
|
+
Adapter,
|
|
14
|
+
GenericAdapter,
|
|
15
|
+
SpeciesAdapter,
|
|
16
|
+
write_csv,
|
|
17
|
+
)
|
|
18
|
+
from labelpull.core import (
|
|
19
|
+
WORKFLOW_STATUSES,
|
|
20
|
+
FeatureRow,
|
|
21
|
+
Summary,
|
|
22
|
+
export,
|
|
23
|
+
flatten,
|
|
24
|
+
read_export_file,
|
|
25
|
+
summarize,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
__version__ = "0.1.0"
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"ADAPTERS",
|
|
32
|
+
"WORKFLOW_STATUSES",
|
|
33
|
+
"Adapter",
|
|
34
|
+
"FeatureRow",
|
|
35
|
+
"GenericAdapter",
|
|
36
|
+
"SpeciesAdapter",
|
|
37
|
+
"Summary",
|
|
38
|
+
"__version__",
|
|
39
|
+
"export",
|
|
40
|
+
"flatten",
|
|
41
|
+
"read_export_file",
|
|
42
|
+
"summarize",
|
|
43
|
+
"write_csv",
|
|
44
|
+
]
|
labelpull/adapters.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Adapters: collapse ontology-agnostic :class:`FeatureRow` rows into a shape.
|
|
2
|
+
|
|
3
|
+
The generic path writes ``FeatureRow`` rows straight to a long-format CSV that
|
|
4
|
+
any project can read. An adapter narrows that to a project-specific wide record.
|
|
5
|
+
:class:`SpeciesAdapter` is the reference implementation, reproducing
|
|
6
|
+
speciesfirst's ``global_key,taxon,organs,labeled_by,workflow_status`` pull CSV
|
|
7
|
+
from the generic rows, so the engine has exactly one parser.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import csv
|
|
13
|
+
from collections import OrderedDict
|
|
14
|
+
from collections.abc import Iterable, Sequence
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Protocol, runtime_checkable
|
|
17
|
+
|
|
18
|
+
from labelpull.core import FeatureRow
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@runtime_checkable
class Adapter(Protocol):
    """Structural interface for anything that shapes features into CSV rows.

    An implementation exposes ``columns`` (the header cells) and a ``rows``
    method that produces one sequence of cells per output row.
    """

    # Header cells; ``write_csv`` writes these as the first CSV row.
    columns: Sequence[str]

    def rows(self, features: Iterable[FeatureRow]) -> Iterable[Sequence[str]]:
        """Return the data rows derived from ``features``."""
        ...
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class GenericAdapter:
    """Long-format adapter: every feature becomes exactly one CSV row."""

    # Header cells, in the same order as the tuple yielded by ``rows``.
    columns: Sequence[str] = (
        "global_key",
        "data_row_id",
        "feature_kind",
        "feature_name",
        "value",
        "workflow_status",
        "labeled_by",
        "created_at",
        "parent_feature_id",
    )

    def rows(self, features: Iterable[FeatureRow]) -> Iterable[Sequence[str]]:
        """Lazily yield one 9-cell tuple per feature, in input order.

        The three optional attributes (``workflow_status``, ``labeled_by``,
        ``created_at``) are written as ``""`` when they are falsy.
        """
        for feat in features:
            status_cell = feat.workflow_status or ""
            annotator_cell = feat.labeled_by or ""
            created_cell = feat.created_at or ""
            yield (
                feat.global_key,
                feat.data_row_id,
                feat.feature_kind,
                feat.feature_name,
                feat.value,
                status_cell,
                annotator_cell,
                created_cell,
                feat.parent_feature_id,
            )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class SpeciesAdapter:
    """One row per ``global_key``: reproduces speciesfirst's pull CSV.

    ``taxon`` is the ``Taxon`` single-select radio; ``organs`` is the ``Organs``
    checklist (``;``-joined). A reached-and-labelled row with neither still
    appears (seeded by the ``label`` sentinel), matching speciesfirst's "reached
    but unlabelled yields ``taxon=''``" behaviour. Rows come out in the order
    each ``global_key`` is first seen in the input.
    """

    columns: Sequence[str] = ("global_key", "taxon", "organs", "labeled_by", "workflow_status")
    # Feature names matched against ``FeatureRow.feature_name``; override on a
    # subclass to reuse the adapter for differently named features.
    taxon_feature = "Taxon"
    organs_feature = "Organs"

    def rows(self, features: Iterable[FeatureRow]) -> Iterable[Sequence[str]]:
        """Collapse ``features`` into one 5-cell tuple per ``global_key``.

        The whole input is consumed before the first row is yielded, because a
        later feature may still fill in a cell of an earlier key.
        """
        # A plain dict keeps insertion order, so no OrderedDict is needed.
        by_key: dict[str, dict[str, str]] = {}
        for f in features:
            rec = by_key.get(f.global_key)
            if rec is None:
                # Allocate the blank record only the first time a key is seen.
                rec = by_key[f.global_key] = {
                    "taxon": "",
                    "organs": "",
                    "labeled_by": "",
                    "workflow_status": "",
                }
            if f.labeled_by:
                rec["labeled_by"] = f.labeled_by
            if f.workflow_status:
                rec["workflow_status"] = f.workflow_status
            if f.feature_kind == "radio" and f.feature_name == self.taxon_feature and f.value:
                rec["taxon"] = f.value
            elif f.feature_kind == "checklist" and f.feature_name == self.organs_feature:
                # Unlike taxon, an empty checklist value is stored as-is.
                rec["organs"] = f.value
        for global_key, rec in by_key.items():
            yield (
                global_key,
                rec["taxon"],
                rec["organs"],
                rec["labeled_by"],
                rec["workflow_status"],
            )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# Registry of adapter classes keyed by schema name; the CLI looks a name up
# here and instantiates the class with no arguments.
ADAPTERS: dict[str, type] = {"generic": GenericAdapter, "species": SpeciesAdapter}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def write_csv(path: str | Path, adapter: Adapter, features: Iterable[FeatureRow]) -> Path:
    """Write ``features`` through ``adapter`` to ``path`` (parents created).

    The header row is ``adapter.columns``; the data rows are whatever
    ``adapter.rows(features)`` produces. An existing file is overwritten.
    The file is always encoded as UTF-8 so the output does not depend on the
    platform's locale encoding.

    Returns the destination as a :class:`~pathlib.Path`.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" hands line-ending control to the csv module, as its docs require.
    with path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(adapter.columns)
        writer.writerows(adapter.rows(features))
    return path
|
labelpull/cli.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""``labelpull`` CLI: pull the latest Labelbox annotations to a tidy CSV.
|
|
2
|
+
|
|
3
|
+
labelpull pull PROJECT_ID -o labels.csv
|
|
4
|
+
labelpull pull PROJECT_ID --status Done --since 2026-06-01
|
|
5
|
+
labelpull pull PROJECT_ID --schema species -o taxa.csv
|
|
6
|
+
labelpull pull PROJECT_ID --from-export export.ndjson # offline, no API key
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import typer
|
|
14
|
+
|
|
15
|
+
from labelpull import __version__
|
|
16
|
+
from labelpull.adapters import ADAPTERS, write_csv
|
|
17
|
+
from labelpull.core import (
|
|
18
|
+
FeatureRow,
|
|
19
|
+
JsonDict,
|
|
20
|
+
_created_at,
|
|
21
|
+
_select_project,
|
|
22
|
+
flatten,
|
|
23
|
+
read_export_file,
|
|
24
|
+
summarize,
|
|
25
|
+
)
|
|
26
|
+
from labelpull.core import export as live_export
|
|
27
|
+
|
|
28
|
+
# Root Typer application that the callback and the ``pull`` command register on.
app = typer.Typer(add_completion=False, help="Pull the latest Labelbox annotations to CSV.")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@app.callback()
def _main() -> None:
    """labelpull: pull the latest Labelbox annotations into a tidy table."""
    # Deliberately empty. NOTE(review): presumably registered so Typer keeps
    # ``pull`` as an explicit subcommand instead of collapsing a one-command
    # app into its only command -- confirm against the Typer documentation.
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@app.command()
def pull(
    project_id: str = typer.Argument(..., help="Labelbox project id to export from."),
    out: Path = typer.Option(
        Path("pulled_labels.csv"), "--out", "-o", help="Where to write the CSV."
    ),
    schema: str = typer.Option(
        "generic",
        help="generic = one row per feature (any ontology); "
        "species = speciesfirst Taxon/Organs wide CSV.",
    ),
    status: str | None = typer.Option(
        None, help="Filter by task-queue stage: ToLabel | InReview | InRework | Done."
    ),
    since: str | None = typer.Option(
        None, help="Keep only rows whose newest label was created on/after this ISO date/time."
    ),
    from_export: Path | None = typer.Option(
        None,
        exists=True,
        dir_okay=False,
        help="Flatten a saved export (JSON/NDJSON) offline instead of the live API.",
    ),
    api_key: str | None = typer.Option(None, help="Labelbox API key (else LABELBOX_API_KEY)."),
) -> None:
    """Export the latest annotations and flatten them to CSV, with a summary."""
    # The docstring above doubles as the command's help text, so details live here:
    #   1. validate --schema and --status before doing any work;
    #   2. load rows from the saved export (offline) or from the live API;
    #   3. flatten every row, print the summary, write the CSV.
    # --status and --since are honoured on both the offline and the live path.

    # Local import: only the --status handling needs these two names.
    from labelpull.core import WORKFLOW_STATUSES, _workflow_status  # noqa: PLC0415

    def status_key(value: str | None) -> str:
        # Compare stages ignoring case and underscores.
        # NOTE(review): saved exports appear to spell stages like "IN_REVIEW"
        # while the CLI uses "InReview" -- confirm against a real export.
        return str(value or "").replace("_", "").casefold()

    if schema not in ADAPTERS:
        raise typer.BadParameter(f"unknown schema {schema!r}; choose from {sorted(ADAPTERS)}")
    if status and status not in WORKFLOW_STATUSES:
        # Fail fast instead of forwarding an unknown stage to the export filter.
        raise typer.BadParameter(
            f"unknown status {status!r}; choose from {list(WORKFLOW_STATUSES)}"
        )
    adapter = ADAPTERS[schema]()

    typer.echo(f"labelpull v{__version__}")
    if from_export is not None:
        rows = read_export_file(from_export)
        if status:
            # Offline equivalent of the live export's workflow_status filter;
            # rows whose status cannot be determined are dropped.
            wanted = status_key(status)
            rows = [
                r
                for r in rows
                if status_key(_workflow_status(_select_project(r, project_id))) == wanted
            ]
        if since is not None:
            rows = [r for r in rows if _row_since(r, project_id, since)]
        typer.echo(f" read {len(rows)} rows from {from_export}")
    else:
        rows = list(live_export(project_id, status=status, since=since, api_key=api_key))
        typer.echo(f" exported {len(rows)} rows from project {project_id}")

    features = [f for r in rows for f in flatten(r, project_id)]
    _print_summary(rows, features)
    write_csv(out, adapter, features)
    typer.echo(f"wrote {schema} CSV: {out}")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _row_since(dr: JsonDict, project_id: str, since: str) -> bool:
    """Tell whether ``dr``'s newest label for ``project_id`` is at/after ``since``.

    The comparison is a plain string comparison of the label's ``created_at``
    value against ``since``; a row with no label compares as ``""``.
    """
    newest_stamp = _created_at(_select_project(dr, project_id))
    return newest_stamp >= since
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _print_summary(rows: list[JsonDict], features: list[FeatureRow]) -> None:
    """Echo the triage summary of a pull: totals, then optional breakdowns.

    The status, kind and latest-label lines are printed only when the
    corresponding summary field is non-empty.
    """
    stats = summarize(rows, features)

    def breakdown(counts: dict[str, int]) -> str:
        # "name=count" pairs, sorted by name.
        return ", ".join(f"{name}={count}" for name, count in sorted(counts.items()))

    headline = (
        f" {stats.n_labelled} labelled / {stats.n_data_rows} rows "
        f"({stats.n_reached_unlabelled} reached unlabelled)"
    )
    typer.echo(headline)
    if stats.statuses:
        typer.echo(" status: " + breakdown(stats.statuses))
    if stats.feature_kinds:
        typer.echo(" kinds: " + breakdown(stats.feature_kinds))
    if stats.latest_created_at:
        typer.echo(f" latest label: {stats.latest_created_at}")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
if __name__ == "__main__": # pragma: no cover
|
|
101
|
+
app()
|
labelpull/core.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
"""Ontology-agnostic Labelbox export + flatten.
|
|
2
|
+
|
|
3
|
+
The Labelbox SDK already exports a project's labels and streams them as deeply
|
|
4
|
+
nested, ontology-shaped JSON. What it does *not* give you is a tabular view, the
|
|
5
|
+
correctness logic to pick the right label when a row was reviewed, or a workflow
|
|
6
|
+
status that is always populated. This module is exactly that thin layer:
|
|
7
|
+
|
|
8
|
+
* :func:`export` wraps ``project.export(...)`` + ``wait_till_done()`` +
|
|
9
|
+
``get_buffered_stream()`` (SDK lazy-imported, so it is optional) and adds a
|
|
10
|
+
``since`` filter for "only the latest annotations".
|
|
11
|
+
* :func:`flatten` turns one export row into :class:`FeatureRow` long-format rows,
|
|
12
|
+
covering *every* feature kind (classifications AND objects) without assuming a
|
|
13
|
+
particular ontology. It encodes the two traps a hand-written parser gets wrong:
|
|
14
|
+
selecting the most-recently-created label (a QC-reviewed row carries both the
|
|
15
|
+
annotator's and the reviewer's label) and normalizing the workflow status.
|
|
16
|
+
* :func:`read_export_file` parses a saved export (UI download or a prior pull) so
|
|
17
|
+
the same flattener runs offline, no API key required.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import json
|
|
23
|
+
import os
|
|
24
|
+
from collections.abc import Iterable, Iterator
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import Any, cast
|
|
28
|
+
|
|
29
|
+
# One export row (and its nested blocks) is arbitrary JSON; alias it for brevity.
|
|
30
|
+
JsonDict = dict[str, Any]
|
|
31
|
+
|
|
32
|
+
# The task-queue stages ``project.export(filters={"workflow_status": ...})`` accepts.
|
|
33
|
+
WORKFLOW_STATUSES = ("ToLabel", "InReview", "InRework", "Done")
|
|
34
|
+
|
|
35
|
+
# Geometry keys a localized object may carry in the v7 export, in probe order.
|
|
36
|
+
_GEOMETRY_KINDS = ("bounding_box", "polygon", "line", "point", "mask")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True)
class FeatureRow:
    """A single ``(label, feature)`` pair taken from one export row.

    The record is ontology-agnostic: it describes either a classification
    answer or a localized object. Classifications nested inside an object get
    rows of their own and point back at the object through
    :attr:`parent_feature_id`. Every labelled data row additionally produces
    one sentinel row with ``feature_kind="label"`` and ``value=""`` so the
    data row is represented even when the label carries no features.
    """

    global_key: str
    data_row_id: str
    # One of: label, radio, checklist, text, bounding_box, polygon,
    # line, point, mask, relationship, unknown.
    feature_kind: str
    feature_name: str
    # Answer value(s) or compact geometry; "" when there is none.
    value: str
    workflow_status: str | None
    labeled_by: str | None
    created_at: str | None
    # "" for top-level features.
    parent_feature_id: str
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def export(
    project_id: str,
    *,
    status: str | None = None,
    since: str | None = None,
    api_key: str | None = None,
    client: Any | None = None,
) -> Iterator[JsonDict]:
    """Yield one export dict per data row of ``project_id``.

    This is a generator: the export is only started once iteration begins.

    ``status`` restricts the export to one task-queue stage (see
    :data:`WORKFLOW_STATUSES`); a falsy value means no filter. ``since`` drops
    rows whose newest label was created before the given ISO date/datetime
    string (the timestamps are compared as strings). ``client`` lets a caller
    inject a stub; when it is ``None`` a client is built from ``api_key`` or
    ``LABELBOX_API_KEY`` and the ``labelbox`` SDK is imported lazily.
    """
    if client is None:
        client = _make_client(api_key)
    project = client.get_project(project_id)

    export_filters = None
    if status:
        export_filters = {"workflow_status": status}
    export_params = {"data_row_details": True, "label_details": True, "project_details": True}

    task = project.export(params=export_params, filters=export_filters)
    task.wait_till_done()
    for item in task.get_buffered_stream():
        payload = item.json
        if since is not None and _created_at(_select_project(payload, project_id)) < since:
            continue  # newest label predates the cut-off
        yield payload
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def read_export_file(path: str | Path) -> list[JsonDict]:
    """Load a saved export (JSON array or NDJSON) for offline flattening.

    The file is decoded as UTF-8 and a leading byte-order mark is tolerated.
    Parsing is attempted in this order:

    * empty or whitespace-only file: returns ``[]``;
    * one JSON document: a list is returned as-is, anything else is wrapped
      in a one-element list;
    * otherwise NDJSON: one JSON document per non-blank line.

    Raises :class:`json.JSONDecodeError` when a line of an NDJSON file is not
    valid JSON.
    """
    # "utf-8-sig" reads plain UTF-8 and also strips a BOM, which json.loads
    # would otherwise reject.
    text = Path(path).read_text(encoding="utf-8-sig").strip()
    if not text:
        return []
    try:
        loaded = json.loads(text)
    except json.JSONDecodeError:
        # Split on "\n" only: str.splitlines() also breaks on U+2028, U+2029
        # and U+0085, which may appear unescaped inside a JSON string and would
        # cut a record in half. A trailing "\r" is whitespace to json.loads.
        return [json.loads(line) for line in text.split("\n") if line.strip()]
    return loaded if isinstance(loaded, list) else [loaded]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def flatten(dr: JsonDict, project_id: str | None = None) -> list[FeatureRow]:
    """Turn one export row into its :class:`FeatureRow` rows (every feature).

    ``project_id`` names the project whose labels are read. With ``None`` the
    single project present in the row is used; a row carrying several projects
    is treated as ambiguous and produces nothing, so labels from different
    projects are never mixed. A row without a ``global_key`` or without any
    label also produces ``[]``.

    Output order: the ``label`` sentinel, top-level classifications, then each
    object immediately followed by its nested classifications, then
    relationships.
    """
    row_info = dr.get("data_row") or {}
    global_key = row_info.get("global_key") or ""
    data_row_id = row_info.get("id") or global_key
    project = _select_project(dr, project_id)
    newest = _latest_label(project)
    if newest is None or not global_key:
        return []

    status = _workflow_status(project)
    meta = newest.get("label_details") or {}
    annotator = meta.get("created_by")
    stamp = meta.get("created_at")
    annotations = newest.get("annotations") or {}

    def feature(kind: str, name: str | None, value: str, parent: str = "") -> FeatureRow:
        # Every row of this data row shares the same key/status/who/when cells.
        return FeatureRow(
            global_key=global_key,
            data_row_id=data_row_id,
            feature_kind=kind,
            feature_name=name or "",
            value=value,
            workflow_status=status,
            labeled_by=annotator,
            created_at=stamp,
            parent_feature_id=parent,
        )

    # Sentinel first: the row was reached and labelled, even if it has no features.
    out: list[FeatureRow] = [feature("label", "", "")]

    for answer in annotations.get("classifications") or []:
        kind, value = _classification_value(answer)
        out.append(feature(kind, answer.get("name"), value))

    for obj in annotations.get("objects") or []:
        kind, value = _object_geometry(obj)
        # Nested classifications are linked to the object through this id.
        owner_id = obj.get("feature_id") or obj.get("feature_schema_id") or ""
        out.append(feature(kind, obj.get("name"), value))
        for nested in obj.get("classifications") or []:
            nested_kind, nested_value = _classification_value(nested)
            out.append(feature(nested_kind, nested.get("name"), nested_value, parent=owner_id))

    for rel in annotations.get("relationships") or []:
        encoded = json.dumps(rel.get("relationship") or {}, sort_keys=True)
        out.append(feature("relationship", rel.get("name"), encoded))

    return out
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@dataclass(frozen=True)
class Summary:
    """Immutable triage figures for one pull: volume, composition, freshness."""

    # Number of export rows handed to ``summarize``.
    n_data_rows: int
    # Number of distinct ``global_key`` values among the features.
    n_labelled: int
    # ``n_data_rows - n_labelled``, floored at 0.
    n_reached_unlabelled: int
    # Occurrences per ``feature_kind`` (the ``label`` sentinel is not counted).
    feature_kinds: dict[str, int]
    # Occurrences per non-empty ``feature_name``.
    feature_names: dict[str, int]
    # Occurrences per workflow status, taken from the ``label`` sentinel rows.
    statuses: dict[str, int]
    # Largest ``created_at`` among the sentinel rows, or ``None`` if there is none.
    latest_created_at: str | None
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def summarize(rows: Iterable[JsonDict], features: Iterable[FeatureRow]) -> Summary:
    """Build the :class:`Summary` for a pull.

    ``rows`` only contributes its length. ``features`` supplies the distinct
    labelled keys, the per-kind and per-name tallies (sentinel rows excluded)
    and, from the ``label`` sentinel rows, the status tally and the newest
    ``created_at`` value. Both iterables are materialized once.
    """
    row_list = list(rows)
    feature_list = list(features)

    def bump(tally: dict[str, int], key: str) -> None:
        tally[key] = tally.get(key, 0) + 1

    kind_tally: dict[str, int] = {}
    name_tally: dict[str, int] = {}
    status_tally: dict[str, int] = {}
    newest: str | None = None

    for feat in feature_list:
        if feat.feature_kind != "label":
            bump(kind_tally, feat.feature_kind)
            if feat.feature_name:
                bump(name_tally, feat.feature_name)
            continue
        # Sentinel row: one per labelled data row, so status/freshness are
        # counted once per data row rather than once per feature.
        if feat.workflow_status:
            bump(status_tally, feat.workflow_status)
        stamp = feat.created_at
        if stamp and (newest is None or stamp > newest):
            newest = stamp

    total_rows = len(row_list)
    labelled = len({feat.global_key for feat in feature_list})
    return Summary(
        n_data_rows=total_rows,
        n_labelled=labelled,
        n_reached_unlabelled=max(total_rows - labelled, 0),
        feature_kinds=kind_tally,
        feature_names=name_tally,
        statuses=status_tally,
        latest_created_at=newest,
    )
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# --- internals -------------------------------------------------------------
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _make_client(api_key: str | None) -> Any:
    """Build a Labelbox client from ``api_key`` or ``LABELBOX_API_KEY``.

    The SDK is imported here, on demand, so it stays an optional dependency.

    Raises :class:`RuntimeError` when the SDK cannot be imported or when no
    API key is available from either source.
    """
    try:
        import labelbox  # noqa: PLC0415 (optional dep, imported only for live pulls)
    except ImportError as exc:  # pragma: no cover - exercised only without the SDK
        message = "a live pull needs the Labelbox SDK: pip install 'labelpull[live]'"
        raise RuntimeError(message) from exc

    # An empty ``api_key`` falls through to the environment variable.
    key = api_key if api_key else os.environ.get("LABELBOX_API_KEY")
    if key:
        return labelbox.Client(api_key=key)
    raise RuntimeError(
        "no Labelbox API key: pass api_key=... or set LABELBOX_API_KEY "
        "(or use a saved export with read_export_file)"
    )
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _select_project(dr: JsonDict, project_id: str | None) -> JsonDict:
    """Return the project block of ``dr`` to read labels from, or ``{}``.

    With a ``project_id`` that project's block is returned. With ``None`` the
    only project present is returned; zero or several projects are treated as
    ambiguous and give ``{}``. A missing or null block always comes back as
    ``{}`` so callers can use ``.get`` on the result unconditionally.
    """
    projects = dr.get("projects") or {}
    if project_id is not None:
        return projects.get(project_id) or {}
    if len(projects) == 1:
        # ``or {}`` mirrors the by-id branch: a null entry must not leak out as None.
        return next(iter(projects.values())) or {}
    return {}  # ambiguous: caller must name the project
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _latest_label(proj: JsonDict) -> JsonDict | None:
    """Return the most recently created label of ``proj``, or ``None`` if it has none.

    A QC-reviewed row carries the annotator's label *and* the reviewer's; the
    verified answer is the one created last, not simply ``labels[0]``. When
    several labels share the newest timestamp the first of them is returned.
    """
    candidates = proj.get("labels") or []
    if candidates:
        return cast("JsonDict", max(candidates, key=_created_at_of_label))
    return None
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _created_at_of_label(label: JsonDict) -> str:
    """Return ``label_details.created_at`` of ``label``, or ``""`` when absent."""
    details = label.get("label_details")
    if not details:
        return ""
    return details.get("created_at") or ""
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _created_at(proj: JsonDict) -> str:
    """Return the creation timestamp of ``proj``'s newest label, or ``""`` if none."""
    newest = _latest_label(proj)
    if not newest:
        return ""
    return _created_at_of_label(newest)
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _workflow_status(proj: JsonDict) -> str | None:
    """Return the workflow status recorded for ``proj``, or ``None`` if absent.

    ``project_details.workflow_status`` wins whenever it is not ``None``.
    Otherwise the value falls back to ``task_queue_name`` and then
    ``task_queue_status``. Values are passed through unchanged; no spelling
    normalisation is applied.
    """
    details = proj.get("project_details") or {}
    status = details.get("workflow_status")
    if status is None:
        # The former '"Done" if queue == "Done" else queue' mapped every value
        # to itself, so the fallback is simply the queue value.
        status = details.get("task_queue_name") or details.get("task_queue_status")
    return status
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def _classification_value(cls: JsonDict) -> tuple[str, str]:
    """Classify one classification block and return ``(kind, value)``.

    Probed in order: a truthy ``radio_answer``, then a non-``None``
    ``checklist_answers`` (answers joined with ``;``, empty ones skipped),
    then a non-``None`` ``text_answer``. An answer contributes its ``value``,
    falling back to its ``name``. Anything else is ``("unknown", "")``.
    """
    radio = cls.get("radio_answer")
    if radio:
        return "radio", radio.get("value") or radio.get("name") or ""

    checklist = cls.get("checklist_answers")
    if checklist is not None:
        picked = (item.get("value") or item.get("name") or "" for item in checklist)
        return "checklist", ";".join(filter(None, picked))

    text = cls.get("text_answer")
    if text is not None:
        return "text", (text or {}).get("content") or ""

    return "unknown", ""
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _object_geometry(obj: JsonDict) -> tuple[str, str]:
    """Return ``(kind, value)`` for the first geometry key present on ``obj``.

    Keys are probed in ``_GEOMETRY_KINDS`` order and a key counts as present
    when its value is not ``None``. A mask is reduced to its ``url`` (or
    ``""``); every other geometry is serialized as JSON with sorted keys.
    An object with no geometry key gives ``("unknown", "")``.
    """
    found = next((kind for kind in _GEOMETRY_KINDS if obj.get(kind) is not None), None)
    if found is None:
        return "unknown", ""
    geometry = obj[found]
    if found == "mask":
        return "mask", (geometry or {}).get("url") or ""
    return found, json.dumps(geometry, sort_keys=True)
|
labelpull/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: labelpull
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pull the latest Labelbox annotations into a tidy, ontology-agnostic table.
|
|
5
|
+
Author-email: Wietze Suijker <wietze.suijker@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: typer>=0.9
|
|
9
|
+
Provides-Extra: dev
|
|
10
|
+
Requires-Dist: mypy>=1.8; extra == 'dev'
|
|
11
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
12
|
+
Requires-Dist: pytest>=7.4; extra == 'dev'
|
|
13
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
14
|
+
Provides-Extra: live
|
|
15
|
+
Requires-Dist: labelbox>=7.0; extra == 'live'
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# labelpull
|
|
19
|
+
|
|
20
|
+
Pull the latest Labelbox annotations into a tidy, ontology-agnostic table.
|
|
21
|
+
|
|
22
|
+
The Labelbox SDK already exports a project's labels and streams them. What it
|
|
23
|
+
doesn't give you is a *tabular* view of that deeply nested JSON, the correctness
|
|
24
|
+
logic to pick the right label when a row was reviewed, or a workflow status that
|
|
25
|
+
is always populated. `labelpull` is exactly that thin layer on top of the SDK.
|
|
26
|
+
|
|
27
|
+
## Install
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install labelpull # offline parsing + CLI
|
|
31
|
+
pip install 'labelpull[live]' # + the Labelbox SDK for live pulls
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## CLI
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
export LABELBOX_API_KEY=...
|
|
38
|
+
labelpull pull <PROJECT_ID> -o labels.csv # generic long CSV (any ontology)
|
|
39
|
+
labelpull pull <PROJECT_ID> --status Done # only verified rows
|
|
40
|
+
labelpull pull <PROJECT_ID> --since 2026-06-01 # only the latest labels
|
|
41
|
+
labelpull pull <PROJECT_ID> --from-export export.ndjson # offline, no API key
|
|
42
|
+
labelpull pull <PROJECT_ID> --schema species -o taxa.csv # speciesfirst Taxon/Organs wide CSV
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
`--schema generic` (default) writes one row per feature — every classification
|
|
46
|
+
and object, any ontology:
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
global_key,data_row_id,feature_kind,feature_name,value,workflow_status,labeled_by,created_at,parent_feature_id
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Library
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import labelpull
|
|
56
|
+
|
|
57
|
+
rows = list(labelpull.export("proj_id", status="Done")) # or read_export_file("export.ndjson")
|
|
58
|
+
features = [f for r in rows for f in labelpull.flatten(r, "proj_id")]
|
|
59
|
+
labelpull.write_csv("labels.csv", labelpull.GenericAdapter(), features)
|
|
60
|
+
print(labelpull.summarize(rows, features))
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
`flatten()` handles radio / checklist / text classifications and bbox / polygon /
|
|
64
|
+
line / point / mask objects (with nested classifications linked to their parent),
|
|
65
|
+
and always selects the most recently created label so a QC-reviewed row reports
|
|
66
|
+
the reviewer's answer, not the annotator's.
|
|
67
|
+
|
|
68
|
+
Write your own `Adapter` to collapse features into a project-specific wide table;
|
|
69
|
+
`SpeciesAdapter` is the reference implementation.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
labelpull/__init__.py,sha256=J8P7ntqvXjksIW6zE4l2nJf8w3As9B5_ekxdIT6NvvA,929
|
|
2
|
+
labelpull/adapters.py,sha256=5nIpl7NI13BfkQtn7eQ_eyD7maOmCkePXk7F8V9qHOc,3834
|
|
3
|
+
labelpull/cli.py,sha256=042bOraQpPAaRtATC-I0UUqKf1I8DR1ASb0opT21ZjQ,3536
|
|
4
|
+
labelpull/core.py,sha256=VGPRMcd2pQLo16KGuHKWGnJ2iyoUxWCq5Y53DdLIXtM,10765
|
|
5
|
+
labelpull/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
labelpull-0.1.0.dist-info/METADATA,sha256=H2Onwo9BL_7J-3gKtgpd8JKl8DyKh3ff0FlTmaf8kwI,2572
|
|
7
|
+
labelpull-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
8
|
+
labelpull-0.1.0.dist-info/entry_points.txt,sha256=vCh16Czaiyg87c3851QSoQTVIjIV55vK0vQ6xWZEbZI,48
|
|
9
|
+
labelpull-0.1.0.dist-info/RECORD,,
|