quiclabel-coco-sync 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quiclabel_coco_sync-0.0.1.dist-info/METADATA +145 -0
- quiclabel_coco_sync-0.0.1.dist-info/RECORD +11 -0
- quiclabel_coco_sync-0.0.1.dist-info/WHEEL +4 -0
- quiclabel_coco_sync-0.0.1.dist-info/entry_points.txt +3 -0
- quiclabel_coco_sync-0.0.1.dist-info/licenses/LICENSE +21 -0
- quiclabel_sync_project_coco/__init__.py +3 -0
- quiclabel_sync_project_coco/api.py +97 -0
- quiclabel_sync_project_coco/cli.py +196 -0
- quiclabel_sync_project_coco/config.py +110 -0
- quiclabel_sync_project_coco/downloader.py +139 -0
- quiclabel_sync_project_coco/writer.py +75 -0
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: quiclabel-coco-sync
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: CLI to incrementally sync a QuicLabel COCO dataset (annotations + images) from quiclabel-admin
|
|
5
|
+
Project-URL: Homepage, https://github.com/weavejam/quiclabel/tree/main/apps/quiclabel-sync-project-coco
|
|
6
|
+
Project-URL: Repository, https://github.com/weavejam/quiclabel
|
|
7
|
+
Project-URL: Issues, https://github.com/weavejam/quiclabel/issues
|
|
8
|
+
Author: weavejam / quiclabel contributors
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: annotation,coco,computer-vision,dataset,quiclabel,sync
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
24
|
+
Classifier: Topic :: Utilities
|
|
25
|
+
Requires-Python: >=3.10
|
|
26
|
+
Requires-Dist: click>=8.1
|
|
27
|
+
Requires-Dist: requests>=2.31
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# quiclabel-coco-sync
|
|
31
|
+
|
|
32
|
+
CLI to incrementally sync a QuicLabel COCO dataset (annotations + images)
|
|
33
|
+
from `quiclabel-admin`. Pulls a fresh `annotations-YYYYMMDD-HHMMSS.json`
|
|
34
|
+
next to your existing dataset and downloads, using multiple threads, only the
|
|
35
|
+
images you don't already have.
|
|
36
|
+
|
|
37
|
+
## Prerequisites
|
|
38
|
+
|
|
39
|
+
- **uv** — Python package & runtime manager. Install:
|
|
40
|
+
- macOS / Linux: `curl -LsSf https://astral.sh/uv/install.sh | sh`
|
|
41
|
+
- Windows: `winget install astral-sh.uv` (or `irm https://astral.sh/uv/install.ps1 | iex`)
|
|
42
|
+
- via pipx: `pipx install uv`
|
|
43
|
+
- **An API key** — get one from quiclabel-admin: *Settings → API Keys → New key*.
|
|
44
|
+
Copy the `qk_...` value immediately (it's only shown once).
|
|
45
|
+
|
|
46
|
+
## Quick start (from PyPI — recommended)
|
|
47
|
+
|
|
48
|
+
No clone, no install — `uvx` downloads, caches and runs in one shot:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
uvx quiclabel-coco-sync path/to/annotations.json \
|
|
52
|
+
--admin-url https://quiclabel-admin.example.com \
|
|
53
|
+
--api-key qk_xxxxxxxxxxxxxxxxxxxxxx
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Or set env vars and call it bare:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
export QUICLABEL_ADMIN_URL=https://quiclabel-admin.example.com
|
|
60
|
+
export QUICLABEL_API_KEY=qk_xxxxxxxxxxxxxxxxxxxxxx
|
|
61
|
+
uvx quiclabel-coco-sync path/to/annotations.json
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Prefer a persistent install? Use `uv tool`:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
uv tool install quiclabel-coco-sync
|
|
68
|
+
quiclabel-coco-sync path/to/annotations.json --admin-url ... --api-key ...
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## From the monorepo (contributors)
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# From the repo root
|
|
75
|
+
pnpm sync-project-coco path/to/annotations.json \
|
|
76
|
+
--admin-url https://quiclabel-admin.example.com \
|
|
77
|
+
--api-key qk_xxxxxxxxxxxxxxxxxxxxxx
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Or directly with `uv` against this app directory:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
cd apps/quiclabel-sync-project-coco
|
|
84
|
+
uv sync
|
|
85
|
+
uv run quiclabel-coco-sync path/to/annotations.json \
|
|
86
|
+
--admin-url https://quiclabel-admin.example.com \
|
|
87
|
+
--api-key qk_xxxxxxxxxxxxxxxxxxxxxx
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## What it does
|
|
91
|
+
|
|
92
|
+
1. Reads `path/to/annotations.json` and its `meta` block (added by the COCO exporter).
|
|
93
|
+
2. Calls `GET /api/v1/projects/<project_id>/coco` with the same filters,
|
|
94
|
+
paging by cursor — so 10k+ task projects don't blow up server memory.
|
|
95
|
+
3. Writes `path/to/annotations-20260519-143045.json` (timestamped — never
|
|
96
|
+
overwrites your input).
|
|
97
|
+
4. Diffs `task_id` sets, downloads any missing images to `path/to/images/`
|
|
98
|
+
using a thread pool. Files already on disk are skipped by file name.
|
|
99
|
+
|
|
100
|
+
The old `annotations.json` and the existing `images/*` files are never touched.
|
|
101
|
+
|
|
102
|
+
## Configuration priority
|
|
103
|
+
|
|
104
|
+
Each value is resolved in this order — first wins:
|
|
105
|
+
|
|
106
|
+
1. CLI flag (`--project-id`, `--statuses`, …)
|
|
107
|
+
2. Env var (`QUICLABEL_ADMIN_URL`, `QUICLABEL_API_KEY`)
|
|
108
|
+
3. `meta` block of the input json
|
|
109
|
+
|
|
110
|
+
If anything required is missing from all three, the CLI exits with a clear
|
|
111
|
+
message naming the missing key and where to provide it.
|
|
112
|
+
|
|
113
|
+
## Recovery
|
|
114
|
+
|
|
115
|
+
- **Partial failure** (some images failed mid-run): just re-run the same
|
|
116
|
+
command. Already-downloaded files are skipped by file name, so retry only
|
|
117
|
+
fetches the remaining ones. The CLI tells you this in the failure summary.
|
|
118
|
+
- **Corrupt image file**: delete it, then re-run.
|
|
119
|
+
- **A `.part` file in `images/`** indicates a crashed download. Safe to delete.
|
|
120
|
+
|
|
121
|
+
## Development
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
cd apps/quiclabel-sync-project-coco
|
|
125
|
+
uv sync --group dev
|
|
126
|
+
uv run pytest
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Releasing to PyPI (maintainers)
|
|
130
|
+
|
|
131
|
+
Manual release flow until CI is wired up:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
cd apps/quiclabel-sync-project-coco
|
|
135
|
+
|
|
136
|
+
# 1. bump version in pyproject.toml
|
|
137
|
+
# 2. build sdist + wheel
|
|
138
|
+
uv build
|
|
139
|
+
|
|
140
|
+
# 3. publish (uses UV_PUBLISH_TOKEN or prompts)
|
|
141
|
+
uv publish
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Get a PyPI API token at <https://pypi.org/manage/account/token/>.
|
|
145
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
quiclabel_sync_project_coco/__init__.py,sha256=iXXcTrgwIndNlaUy8L-UzoOSThTjKiVbr4JjWAuTz2s,78
|
|
2
|
+
quiclabel_sync_project_coco/api.py,sha256=ROx2QwzGdeYkBa5n5XiNNtPQUH7kcxYx7NnAyp_mfro,3004
|
|
3
|
+
quiclabel_sync_project_coco/cli.py,sha256=h0FtdFGoXsfRfqnops5v8CzACrIfHYcOTnoLYMNuCVU,6020
|
|
4
|
+
quiclabel_sync_project_coco/config.py,sha256=s3x_926IRQl2n4P1ZHjeCB50o82iRthCTUhQQIkiOZ0,3562
|
|
5
|
+
quiclabel_sync_project_coco/downloader.py,sha256=uDL2HCQqY9N09JBxBbLXA4Dxmw9StWsIyFKfVGTfYr0,4187
|
|
6
|
+
quiclabel_sync_project_coco/writer.py,sha256=h7t8tJ9JRYstU8HxygRiMhDlsG3GCFAppk-JyhQv_50,2631
|
|
7
|
+
quiclabel_coco_sync-0.0.1.dist-info/METADATA,sha256=B2TfJ6-iw5WsQux6DrR_WIi44KhEhfnvYWMi2LBwqrU,4832
|
|
8
|
+
quiclabel_coco_sync-0.0.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
9
|
+
quiclabel_coco_sync-0.0.1.dist-info/entry_points.txt,sha256=VoZ2CGiLkpPFsIDwks4LY0N5XZNpbkCt4ZXwyjNXGq0,134
|
|
10
|
+
quiclabel_coco_sync-0.0.1.dist-info/licenses/LICENSE,sha256=81sEC8BzWYWf93bVcTyfCnrosDbvZ0NXjPHXQOUbYNc,1111
|
|
11
|
+
quiclabel_coco_sync-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 weavejam / quiclabel contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""HTTP client for GET /api/v1/projects/:id/coco with cursor pagination."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import time
|
|
6
|
+
from typing import Any, Iterator
|
|
7
|
+
|
|
8
|
+
import requests
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
RETRYABLE_STATUS = {429, 500, 502, 503, 504}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ApiError(Exception):
    """Raised when a COCO API request cannot be completed.

    This covers authentication failures (401/403), any other non-200 status
    that is not retryable or is still failing once retries are exhausted, and
    network errors that persist through every attempt.
    """
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _request_page(
    session: requests.Session,
    url: str,
    params: dict[str, Any],
    headers: dict[str, str],
    *,
    max_retries: int = 3,
    timeout: float = 30.0,
) -> dict[str, Any]:
    """Fetch one page, retrying transient failures with exponential backoff.

    Network errors and statuses listed in ``RETRYABLE_STATUS`` are retried up
    to ``max_retries`` times, sleeping ``2**attempt`` seconds between tries.

    Returns:
        The decoded JSON object of a 200 response.

    Raises:
        ApiError: on 401/403; on any other non-200 status that is either not
            retryable or still failing on the last attempt; on a network error
            that persists through every attempt; and on a 200 response whose
            body is not a JSON object.
    """
    for attempt in range(max_retries + 1):
        try:
            resp = session.get(url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            if attempt == max_retries:
                raise ApiError(f"Network error after {max_retries + 1} attempts: {e}") from e
            logger.warning("network error (attempt %d): %s", attempt + 1, e)
            time.sleep(2**attempt)
            continue

        if resp.status_code == 200:
            # A proxy or misconfigured server can answer 200 with HTML or a
            # truncated body; surface that as ApiError so the CLI reports it
            # cleanly instead of crashing with a decode traceback.
            try:
                payload = resp.json()
            except ValueError as e:
                raise ApiError(f"API returned invalid JSON: {e}") from e
            if not isinstance(payload, dict):
                raise ApiError(
                    "API returned unexpected JSON "
                    f"({type(payload).__name__}); expected an object"
                )
            return payload

        if resp.status_code in (401, 403):
            raise ApiError(
                f"Authentication failed ({resp.status_code}): check --api-key"
            )

        if resp.status_code in RETRYABLE_STATUS and attempt < max_retries:
            logger.warning(
                "server %d (attempt %d), retrying", resp.status_code, attempt + 1
            )
            time.sleep(2**attempt)
            continue

        # Non-retryable status, or a retryable one on the final attempt.
        raise ApiError(f"API error {resp.status_code}: {resp.text[:500]}")

    raise ApiError("retry loop exhausted")  # pragma: no cover
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def iter_pages(
    admin_url: str,
    api_key: str,
    project_id: str,
    *,
    statuses: list[str],
    tag_ids: list[str],
    image_source: str,
    limit: int = 500,
    session: requests.Session | None = None,
) -> Iterator[dict[str, Any]]:
    """Yield each page dict of ``GET /api/v1/projects/<project_id>/coco`` in turn.

    The first page has meta/info/categories; subsequent pages only have
    images/annotations/next_cursor. Iteration stops when a page carries no
    ``next_cursor``.

    Args:
        admin_url: Base URL the endpoint path is appended to.
        api_key: Sent as a ``Bearer`` token in the ``Authorization`` header.
        project_id: Project whose COCO export is fetched.
        statuses: Sent comma-joined as the ``statuses`` query parameter.
        tag_ids: Sent comma-joined as ``tag_ids``; omitted when empty.
        image_source: Sent as the ``image_source`` query parameter.
        limit: Per-page limit sent as ``limit``.
        session: Optional session to reuse. When omitted, a session is created
            here and closed once the generator finishes or is closed.

    Raises:
        ApiError: propagated from ``_request_page``, or raised here when the
            server hands back the same cursor twice in a row.
    """
    owns_session = session is None
    s = requests.Session() if owns_session else session
    headers = {"Authorization": f"Bearer {api_key}"}
    url = f"{admin_url}/api/v1/projects/{project_id}/coco"
    params: dict[str, Any] = {
        "statuses": ",".join(statuses),
        "image_source": image_source,
        "limit": limit,
    }
    if tag_ids:
        params["tag_ids"] = ",".join(tag_ids)

    cursor: str | None = None
    page_num = 0
    try:
        while True:
            page_params = dict(params)
            if cursor:
                page_params["cursor"] = cursor

            page = _request_page(s, url, page_params, headers)
            page_num += 1
            logger.info("page %d: %d images", page_num, len(page.get("images") or []))
            yield page

            next_cursor = page.get("next_cursor")
            if not next_cursor:
                return
            # A cursor that does not advance would re-request the same page
            # forever; fail loudly instead.
            if next_cursor == cursor:
                raise ApiError(
                    f"API returned the same cursor twice ({next_cursor!r}); "
                    "aborting to avoid an endless loop"
                )
            cursor = next_cursor
    finally:
        # Only release what we created; a caller-supplied session stays open.
        if owns_session:
            s.close()
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""sync-project-coco CLI entrypoint."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
|
|
11
|
+
from .api import ApiError, iter_pages
|
|
12
|
+
from .config import ConfigError, resolve_config
|
|
13
|
+
from .downloader import DownloadJob, download_all
|
|
14
|
+
from .writer import timestamped_filename, write_dataset
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _setup_logging(verbose: bool) -> None:
    """Configure root logging: DEBUG when ``verbose`` is set, INFO otherwise."""
    level = logging.INFO
    if verbose:
        level = logging.DEBUG
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
        datefmt="%H:%M:%S",
        level=level,
    )
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.argument(
    "input_path",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
)
@click.option("--admin-url", envvar="QUICLABEL_ADMIN_URL", help="Admin base URL")
@click.option("--api-key", envvar="QUICLABEL_API_KEY", help="API key (qk_...)")
@click.option("--project-id", help="Overrides meta.project_id")
@click.option("--statuses", help="Comma-separated, overrides meta.filters.statuses")
@click.option("--tag-ids", help="Comma-separated, overrides meta.filters.tag_ids")
@click.option(
    "--image-source",
    type=click.Choice(["compressed", "original"]),
    help="Overrides meta.image_source",
)
@click.option(
    "--concurrency",
    type=click.IntRange(1, 128),
    default=16,
    show_default=True,
    help="Image download threads",
)
@click.option(
    "--limit",
    type=click.IntRange(1, 1000),
    default=500,
    show_default=True,
    help="Per-page limit",
)
@click.option("-v", "--verbose", is_flag=True, help="Verbose logging")
def main(
    input_path: Path,
    admin_url: str | None,
    api_key: str | None,
    project_id: str | None,
    statuses: str | None,
    tag_ids: str | None,
    image_source: str | None,
    concurrency: int,
    limit: int,
    verbose: bool,
) -> None:
    """Sync a QuicLabel COCO dataset: pulls a fresh annotations json (named
    ``annotations-YYYYMMDD-HHMMSS.json``) and downloads any missing images
    next to it. The input ``annotations.json`` and existing images are never
    modified.

    Config priority: CLI flag > env var > meta block in the input json.
    """
    # Exit codes: 2 for unreadable input / missing configuration, 1 for API,
    # dataset-assembly, or download failures.
    _setup_logging(verbose)
    log = logging.getLogger("sync-project-coco")

    try:
        input_doc = json.loads(input_path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError) as e:
        click.echo(f"failed to read {input_path}: {e}", err=True)
        sys.exit(2)
    if not isinstance(input_doc, dict):
        click.echo(f"input {input_path} is not a JSON object", err=True)
        sys.exit(2)

    input_meta = input_doc.get("meta")
    if input_meta is not None and not isinstance(input_meta, dict):
        log.warning("meta in input is not an object — ignoring")
        input_meta = None

    try:
        cfg = resolve_config(
            input_meta=input_meta,
            cli_admin_url=admin_url,
            cli_api_key=api_key,
            cli_project_id=project_id,
            cli_statuses=statuses,
            cli_tag_ids=tag_ids,
            cli_image_source=image_source,
            cli_concurrency=concurrency,
        )
    except ConfigError as e:
        click.echo(str(e), err=True)
        sys.exit(2)

    log.info(
        "project=%s statuses=%s tag_ids=%s image_source=%s",
        cfg.project_id,
        cfg.statuses,
        cfg.tag_ids,
        cfg.image_source,
    )

    out_dir = input_path.parent
    new_annotations_path = out_dir / timestamped_filename()
    images_dir = out_dir / "images"

    try:
        pages = iter_pages(
            cfg.admin_url,
            cfg.api_key,
            cfg.project_id,
            statuses=cfg.statuses,
            tag_ids=cfg.tag_ids,
            image_source=cfg.image_source,
            limit=limit,
        )
        dataset = write_dataset(pages, new_annotations_path)
    except ApiError as e:
        click.echo(f"API error: {e}", err=True)
        sys.exit(1)
    except (KeyError, ValueError) as e:
        # write_dataset raises these when the response stream is empty or a
        # page is missing/mis-referencing ids; report instead of a traceback.
        click.echo(f"malformed API response: {e!r}", err=True)
        sys.exit(1)
    except OSError as e:
        click.echo(f"failed to write {new_annotations_path}: {e}", err=True)
        sys.exit(1)

    log.info(
        "wrote %s (%d images, %d annotations)",
        new_annotations_path.name,
        len(dataset["images"]),
        len(dataset["annotations"]),
    )

    # Diff against the input doc directly (already parsed, no re-read).
    # Non-dict entries in a hand-edited input are ignored rather than crashing.
    old_ids = {
        img["task_id"]
        for img in (input_doc.get("images") or [])
        if isinstance(img, dict) and img.get("task_id")
    }
    new_images = [
        img for img in dataset["images"] if img.get("task_id") not in old_ids
    ]
    log.info("new task_ids since input: %d", len(new_images))

    jobs: list[DownloadJob] = []
    for img in new_images:
        file_name = img.get("file_name")
        if not file_name:
            # Without a file name there is nowhere to store the image.
            log.warning(
                "image without file_name skipped (task_id=%s)", img.get("task_id")
            )
            continue
        jobs.append(DownloadJob(file_name=file_name, url=img.get("url", "")))

    if jobs:
        with click.progressbar(
            length=len(jobs),
            label="downloading",
            file=sys.stderr,
        ) as bar:
            results = download_all(
                jobs,
                images_dir,
                concurrency=cfg.concurrency,
                progress=lambda done, _total: bar.update(1),
            )
    else:
        results = []

    downloaded = sum(1 for r in results if r.ok and not r.skipped)
    skipped = sum(1 for r in results if r.skipped)
    failed = [r for r in results if not r.ok]

    log.info(
        "done — downloaded=%d skipped=%d failed=%d",
        downloaded,
        skipped,
        len(failed),
    )

    if failed:
        click.echo("Failed downloads:", err=True)
        # Cap the listing so a mass failure does not flood the terminal.
        for r in failed[:20]:
            click.echo(f"  {r.file_name}: {r.error}", err=True)
        if len(failed) > 20:
            click.echo(f"  ... and {len(failed) - 20} more", err=True)
        click.echo(
            "Re-run the same command to retry — already-downloaded files are skipped.",
            err=True,
        )
        sys.exit(1)


if __name__ == "__main__":  # pragma: no cover
    main()
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Resolve sync configuration from CLI flags, env vars, and the input json's
|
|
2
|
+
``meta`` block. Priority: CLI > env > json.meta.
|
|
3
|
+
|
|
4
|
+
All required parameters must come from somewhere — missing config raises
|
|
5
|
+
``ConfigError`` with a message explaining which key is missing and where the
|
|
6
|
+
user can provide it.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ConfigError(Exception):
    """Signals that a required configuration value could not be resolved."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class SyncConfig:
    """Immutable bundle of the settings resolved for one sync run."""

    # Base URL of the admin service.
    admin_url: str
    # API key used to authenticate requests.
    api_key: str
    # Project whose COCO export is synced.
    project_id: str
    # Status filter values.
    statuses: list[str]
    # Tag filter values; may be empty.
    tag_ids: list[str]
    # Which image variant to request.
    image_source: str
    # Number of image download threads.
    concurrency: int
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _csv(value: str | None) -> list[str] | None:
    """Split a comma-separated string into its non-blank, stripped items.

    ``None`` is passed through unchanged so callers can tell "option not
    given" apart from "option given but empty" (which yields ``[]``).
    """
    if value is None:
        return None
    stripped = (piece.strip() for piece in value.split(","))
    return [piece for piece in stripped if piece]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def resolve_config(
    *,
    input_meta: dict[str, Any] | None,
    cli_admin_url: str | None,
    cli_api_key: str | None,
    cli_project_id: str | None,
    cli_statuses: str | None,
    cli_tag_ids: str | None,
    cli_image_source: str | None,
    cli_concurrency: int,
    env: dict[str, str] | None = None,
) -> SyncConfig:
    """Resolve all sync parameters from CLI values, environment, and ``meta``.

    ``input_meta`` is the ``meta`` block from the input annotations.json (may
    be None for legacy exports).

    Sources per value:
      * ``admin_url`` / ``api_key``: CLI value, else ``QUICLABEL_ADMIN_URL`` /
        ``QUICLABEL_API_KEY`` from ``env`` (defaults to ``os.environ``).
      * ``project_id`` / ``image_source``: CLI value, else ``meta``.
      * ``statuses`` / ``tag_ids``: CLI value (comma-separated), else
        ``meta.filters``. ``tag_ids`` is optional and defaults to ``[]``.

    Raises:
        ConfigError: when any required value is missing; the message lists
            every missing key and where it can be supplied.
    """
    env = env if env is not None else os.environ
    meta = input_meta or {}
    filters = meta.get("filters")
    if not isinstance(filters, dict):
        # A malformed (non-object) filters block is treated as absent.
        filters = {}

    def _meta_list(key: str) -> list[str] | None:
        """Read a list-valued filter from meta, tolerating a CSV string."""
        raw = filters.get(key)
        if isinstance(raw, str):
            # A bare string would otherwise be joined character by character
            # when the query parameters are built.
            return _csv(raw)
        if isinstance(raw, (list, tuple)):
            return list(raw)
        return None

    admin_url = cli_admin_url or env.get("QUICLABEL_ADMIN_URL")
    api_key = cli_api_key or env.get("QUICLABEL_API_KEY")
    project_id = cli_project_id or meta.get("project_id")

    # An explicitly passed (even empty) CLI list wins over meta.
    cli_statuses_list = _csv(cli_statuses)
    statuses = (
        cli_statuses_list
        if cli_statuses_list is not None
        else (_meta_list("statuses") or None)
    )

    cli_tag_ids_list = _csv(cli_tag_ids)
    tag_ids = (
        cli_tag_ids_list
        if cli_tag_ids_list is not None
        else (_meta_list("tag_ids") or [])
    )

    image_source = (
        cli_image_source or meta.get("image_source") or None
    )

    # Collect every missing key so the user can fix them all in one go.
    missing: list[tuple[str, str]] = []
    if not admin_url:
        missing.append(("admin_url", "--admin-url or QUICLABEL_ADMIN_URL"))
    if not api_key:
        missing.append(("api_key", "--api-key or QUICLABEL_API_KEY"))
    if not project_id:
        missing.append(
            ("project_id", "input json meta.project_id or --project-id")
        )
    if statuses is None:
        missing.append(
            ("statuses", "input json meta.filters.statuses or --statuses")
        )
    if image_source is None:
        missing.append(
            ("image_source", "input json meta.image_source or --image-source")
        )

    if missing:
        lines = [f"  - {key}: provide via {src}" for key, src in missing]
        raise ConfigError(
            "Missing required configuration:\n" + "\n".join(lines)
        )

    return SyncConfig(
        admin_url=admin_url.rstrip("/"),  # type: ignore[union-attr]
        api_key=api_key,  # type: ignore[arg-type]
        project_id=project_id,  # type: ignore[arg-type]
        statuses=statuses,  # type: ignore[arg-type]
        tag_ids=tag_ids,
        image_source=image_source,  # type: ignore[arg-type]
        concurrency=cli_concurrency,
    )
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Parallel image downloader for the sync CLI.
|
|
2
|
+
|
|
3
|
+
Threads (not asyncio) since the bottleneck is network I/O and requests +
|
|
4
|
+
Azure Blob's CDN handle connection reuse just fine within a thread pool.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Iterable
|
|
15
|
+
|
|
16
|
+
import requests
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
RETRYABLE_STATUS = {429, 500, 502, 503, 504}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class DownloadJob:
    """One image to fetch."""

    # Name of the file, relative to the images directory.
    file_name: str
    # Source URL; an empty value is reported as a failed download.
    url: str
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class DownloadResult:
    """Outcome of a single download job."""

    # File name of the job this result belongs to.
    file_name: str
    # True when the file is available on disk after the job.
    ok: bool
    # True when the file already existed and nothing was fetched.
    skipped: bool = False
    # Short failure description; None on success.
    error: str | None = None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _stream_to_file(response, dest: Path) -> None:
    """Stream the body of ``response`` into ``dest`` through a ``.part`` file.

    The body is written to ``<dest>.part`` in 64 KiB chunks and renamed onto
    ``dest`` only once fully written. On any error the partial file is removed
    (best effort) and the error is re-raised.
    """
    partial = dest.with_suffix(dest.suffix + ".part")
    try:
        with open(partial, "wb") as sink:
            for piece in response.iter_content(chunk_size=1 << 16):
                if not piece:
                    continue
                sink.write(piece)
        partial.replace(dest)
    except BaseException:
        # Includes KeyboardInterrupt: never leave a half-written file behind.
        if partial.exists():
            try:
                partial.unlink()
            except OSError:
                pass
        raise
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _download_one(
    job: DownloadJob,
    images_dir: Path,
    session: requests.Session,
    timeout: float,
    max_retries: int,
) -> DownloadResult:
    """Download a single job into ``images_dir`` and report the outcome.

    An existing target file is reported as skipped. Network errors and
    statuses in ``RETRYABLE_STATUS`` are retried up to ``max_retries`` times
    with ``2**attempt`` seconds of backoff. This function returns a failed
    ``DownloadResult`` rather than raising for HTTP, network, and local
    filesystem errors.
    """
    target = images_dir / job.file_name

    # file_name comes from the API response. Refuse empty names and anything
    # (absolute paths, "..") that would resolve outside images_dir.
    root = images_dir.resolve()
    resolved = target.resolve()
    if not job.file_name or resolved == root or not resolved.is_relative_to(root):
        return DownloadResult(
            file_name=job.file_name, ok=False, error="unsafe file name"
        )

    if target.exists():
        return DownloadResult(file_name=job.file_name, ok=True, skipped=True)

    if not job.url:
        return DownloadResult(file_name=job.file_name, ok=False, error="no url")

    last_err: str | None = None
    for attempt in range(max_retries + 1):
        try:
            with session.get(job.url, stream=True, timeout=timeout) as r:
                if r.status_code == 200:
                    # file_name may contain sub-directories.
                    target.parent.mkdir(parents=True, exist_ok=True)
                    _stream_to_file(r, target)
                    return DownloadResult(file_name=job.file_name, ok=True)
                if r.status_code in RETRYABLE_STATUS and attempt < max_retries:
                    last_err = f"HTTP {r.status_code}"
                    time.sleep(2**attempt)
                    continue
                last_err = f"HTTP {r.status_code}"
                break
        except requests.RequestException as e:
            last_err = str(e)[:200]
            if attempt < max_retries:
                time.sleep(2**attempt)
                continue
            break
        except OSError as e:
            # Local filesystem failure (disk full, permissions, ...): retrying
            # the request will not help, so record it and stop.
            last_err = f"write failed: {e}"[:200]
            break

    return DownloadResult(file_name=job.file_name, ok=False, error=last_err)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def download_all(
    jobs: Iterable[DownloadJob],
    images_dir: Path,
    *,
    concurrency: int = 16,
    timeout: float = 60.0,
    max_retries: int = 3,
    session_factory=None,
    progress=None,
) -> list[DownloadResult]:
    """Download jobs in parallel. Returns one result per job. Failures don't
    abort the run — the caller decides what to do with the failure list.

    Each worker thread gets its own ``requests.Session`` via threading.local —
    Session is not documented thread-safe for cookie/auth state mutation.
    Sessions created here are closed before returning.

    ``progress(done, total)`` callback invoked after each result, if supplied.

    Results are appended in completion order, not submission order.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    job_list = list(jobs)
    results: list[DownloadResult] = []

    if not job_list:
        return results

    make_session = session_factory or requests.Session
    local = threading.local()
    created_sessions: list = []

    def _per_thread_session() -> requests.Session:
        s = getattr(local, "session", None)
        if s is None:
            s = make_session()
            local.session = s
            # Remember it so it can be closed once the pool is done.
            created_sessions.append(s)
        return s

    def _task(job: DownloadJob) -> DownloadResult:
        return _download_one(
            job, images_dir, _per_thread_session(), timeout, max_retries
        )

    try:
        with ThreadPoolExecutor(max_workers=concurrency) as pool:
            futures = {pool.submit(_task, j): j for j in job_list}
            for fut in as_completed(futures):
                try:
                    res = fut.result()
                except Exception as e:  # worker boundary: keep the run alive
                    failed_job = futures[fut]
                    logger.exception(
                        "unexpected error downloading %s", failed_job.file_name
                    )
                    res = DownloadResult(
                        file_name=failed_job.file_name,
                        ok=False,
                        error=str(e)[:200] or type(e).__name__,
                    )
                results.append(res)
                if progress is not None:
                    progress(len(results), len(job_list))
    finally:
        for s in created_sessions:
            # A custom session_factory may return objects without close().
            close = getattr(s, "close", None)
            if callable(close):
                close()

    return results
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Merge paged COCO API responses into a single annotations JSON file.
|
|
2
|
+
|
|
3
|
+
Pages are accumulated in memory and written once at the end. OK up to
|
|
4
|
+
~100k tasks (a few hundred MB); cursor pagination handles the server side.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Iterable
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def timestamped_filename(now: datetime | None = None) -> str:
    """Build the output name ``annotations-YYYYMMDD-HHMMSS.json``.

    Uses ``now`` when given, otherwise the current local time.
    """
    moment = now or datetime.now()
    stamp = moment.strftime("%Y%m%d-%H%M%S")
    return "annotations-" + stamp + ".json"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def write_dataset(
    pages: Iterable[dict[str, Any]],
    output_path: Path,
) -> dict[str, Any]:
    """Merge paged responses into a single COCO file at ``output_path``.

    ``meta``, ``info`` and ``categories`` are taken from the first page.
    Image ids are re-keyed to be unique across pages, annotation
    ``image_id`` values are remapped accordingly, and annotation ids are
    renumbered 1..N. Page dicts are modified in place.

    The file is written to ``<output_path>.part`` and renamed into place, so
    an interrupted run never leaves a truncated annotations file.

    Returns:
        The assembled dataset dict.

    Raises:
        ValueError: if the stream is empty or an annotation references an
            image_id not present on its own page.
        KeyError: if an image lacks ``id`` or an annotation lacks ``image_id``.
    """
    first: dict[str, Any] | None = None
    images: list[dict[str, Any]] = []
    annotations: list[dict[str, Any]] = []
    # Server numbers per-page from 1; re-key to globally-unique ids so
    # consumers can use image.id as a join key if they want.
    image_id_offset = 0

    for page in pages:
        if first is None:
            first = page
        page_images = page.get("images") or []
        page_annotations = page.get("annotations") or []

        # Maps this page's original image ids to their new global ids.
        old_to_new: dict[int, int] = {}
        for img in page_images:
            new_id = image_id_offset + img["id"]
            old_to_new[img["id"]] = new_id
            img["id"] = new_id
            images.append(img)
        for ann in page_annotations:
            old = ann["image_id"]
            if old not in old_to_new:
                raise ValueError(
                    f"Annotation references unknown image_id={old} on page "
                    f"starting at offset {image_id_offset}"
                )
            ann["image_id"] = old_to_new[old]
            annotations.append(ann)

        image_id_offset += len(page_images)

    if first is None:
        raise ValueError("No pages received from API")

    # Renumber annotation ids contiguously 1..N across pages.
    for new_id, ann in enumerate(annotations, start=1):
        ann["id"] = new_id

    dataset: dict[str, Any] = {
        "meta": first.get("meta", {}),
        "info": first.get("info", {}),
        "categories": first.get("categories", []),
        "images": images,
        "annotations": annotations,
    }

    # Stream the JSON straight to disk instead of building the whole document
    # as one string first, and only expose the file once it is complete.
    tmp_path = output_path.with_name(output_path.name + ".part")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(dataset, f, indent=2)
        tmp_path.replace(output_path)
    except BaseException:
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass
        raise
    return dataset
|