insto 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- insto/__init__.py +3 -0
- insto/__main__.py +6 -0
- insto/_redact.py +88 -0
- insto/_version.py +1 -0
- insto/backends/__init__.py +52 -0
- insto/backends/_base.py +135 -0
- insto/backends/_cdn.py +343 -0
- insto/backends/_fake.py +196 -0
- insto/backends/_hiker_map.py +355 -0
- insto/backends/_retry.py +103 -0
- insto/backends/hiker.py +681 -0
- insto/cli.py +547 -0
- insto/commands/__init__.py +55 -0
- insto/commands/_base.py +532 -0
- insto/commands/batch.py +434 -0
- insto/commands/content.py +354 -0
- insto/commands/dossier.py +549 -0
- insto/commands/interactions.py +334 -0
- insto/commands/media.py +329 -0
- insto/commands/network.py +317 -0
- insto/commands/operational.py +272 -0
- insto/commands/profile.py +225 -0
- insto/commands/target.py +65 -0
- insto/commands/watch.py +334 -0
- insto/config.py +245 -0
- insto/exceptions.py +113 -0
- insto/models.py +223 -0
- insto/repl.py +417 -0
- insto/service/__init__.py +6 -0
- insto/service/analytics.py +345 -0
- insto/service/exporter.py +323 -0
- insto/service/facade.py +456 -0
- insto/service/history.py +580 -0
- insto/service/watch.py +227 -0
- insto/ui/__init__.py +29 -0
- insto/ui/banner.py +195 -0
- insto/ui/render.py +198 -0
- insto/ui/theme.py +53 -0
- insto-0.1.0.dist-info/METADATA +188 -0
- insto-0.1.0.dist-info/RECORD +43 -0
- insto-0.1.0.dist-info/WHEEL +4 -0
- insto-0.1.0.dist-info/entry_points.txt +2 -0
- insto-0.1.0.dist-info/licenses/LICENSE +21 -0
insto/__init__.py
ADDED
insto/__main__.py
ADDED
insto/_redact.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Single source of truth for secret redaction.
|
|
2
|
+
|
|
3
|
+
Used by `insto.cli._format_error` (user-facing error strings) and by the
|
|
4
|
+
logging formatter (file-side log output). Anything that may end up in
|
|
5
|
+
front of human eyes — stderr, log files, copy-pasted bug reports — passes
|
|
6
|
+
through `redact_secrets()` first.
|
|
7
|
+
|
|
8
|
+
Patterns covered:
|
|
9
|
+
|
|
10
|
+
- the literal value of `$HIKERAPI_TOKEN` if it is set in the environment;
|
|
11
|
+
- the literal values of any tokens / proxy credentials registered at
|
|
12
|
+
runtime via `register_secret()` (used by the config loader so a token
|
|
13
|
+
loaded from `~/.insto/config.toml` or supplied by `--proxy user:pass@`
|
|
14
|
+
is redacted the same way `$HIKERAPI_TOKEN` is);
|
|
15
|
+
- query-string `signature=` and `token=` parameters in URLs (HikerAPI
|
|
16
|
+
signs every CDN URL — those signatures are short-lived but still
|
|
17
|
+
sensitive);
|
|
18
|
+
- `Authorization: Bearer <token>` style headers if they ever surface
|
|
19
|
+
in an exception message;
|
|
20
|
+
- `proxy://user:pass@host` userinfo segments.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import os
|
|
26
|
+
import re
|
|
27
|
+
import threading
|
|
28
|
+
|
|
29
|
+
# Query-string secrets: a ``signature=`` / ``token=`` parameter at the start of
# the text or right after ``?`` / ``&``. Group 1 keeps the parameter name so
# only the value is masked.
_QS_SECRET_RE = re.compile(
    r"((?:^|[?&])(?:signature|token)=)[^&\s'\"]+",
    re.IGNORECASE,
)
# ``Bearer <token>`` credentials; group 1 keeps the ``Bearer `` prefix.
_BEARER_RE = re.compile(
    r"(Bearer\s+)[A-Za-z0-9._~+/=-]+",
    re.IGNORECASE,
)
# ``scheme://user:pass@`` userinfo; group 1 keeps the ``scheme://`` prefix.
_PROXY_USERINFO_RE = re.compile(
    r"(\b[a-zA-Z][a-zA-Z0-9+.-]*://)([^:/@\s]+):([^@/\s]+)@",
)

# Literal secrets shorter than this are never redacted: they would match
# common substrings and shred unrelated text.
_MIN_SECRET_LEN = 4

# Guards every read and write of `_registered_secrets`.
_secrets_lock = threading.Lock()
_registered_secrets: set[str] = set()


def register_secret(value: str | None) -> None:
    """Add `value` to the runtime redaction set.

    Safe to call repeatedly with the same value. ``None``, the empty string
    and values shorter than 4 characters are ignored to avoid pathological
    matches against common substrings. The set is mutated under a mutex so
    concurrent `redact_secrets` callers never observe a half-updated set.
    """
    if not value or len(value) < _MIN_SECRET_LEN:
        return
    with _secrets_lock:
        _registered_secrets.add(value)


def clear_registered_secrets() -> None:
    """Drop every value registered via `register_secret`. Useful in tests."""
    with _secrets_lock:
        _registered_secrets.clear()


def redact_secrets(text: str) -> str:
    """Return `text` with known secret-shaped substrings replaced with `***`.

    Redaction happens in two stages:

    1. literal values — the current ``$HIKERAPI_TOKEN`` (when it is at least
       4 characters long) and everything registered via `register_secret`;
    2. pattern matches — ``scheme://user:pass@`` userinfo, ``signature=`` /
       ``token=`` query parameters and ``Bearer <token>`` credentials.

    Literal values are replaced longest-first. When one secret is a prefix or
    substring of another, replacing the shorter one first would leave the
    tail of the longer one readable (``***efgh``).

    An empty / falsy `text` is returned unchanged. The registered-secrets set
    is snapshotted under a short mutex, so the function is safe to call from
    concurrent logging handlers.
    """
    if not text:
        return text

    # Snapshot under the lock; the replacements themselves run unlocked.
    with _secrets_lock:
        literals = set(_registered_secrets)
    env_token = os.environ.get("HIKERAPI_TOKEN")
    if env_token and len(env_token) >= _MIN_SECRET_LEN:
        literals.add(env_token)

    redacted = text
    # Longest first (ties broken lexicographically for determinism).
    for secret in sorted(literals, key=lambda item: (-len(item), item)):
        redacted = redacted.replace(secret, "***")

    redacted = _PROXY_USERINFO_RE.sub(r"\1***:***@", redacted)
    redacted = _QS_SECRET_RE.sub(r"\1***", redacted)
    redacted = _BEARER_RE.sub(r"\1***", redacted)
    return redacted


__all__ = ["clear_registered_secrets", "redact_secrets", "register_secret"]
|
insto/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
insto/backends/__init__.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Backend factory.
|
|
2
|
+
|
|
3
|
+
`make_backend(name, **opts)` is the single entry point used by the service
|
|
4
|
+
facade to construct a backend. Concrete backend modules are imported lazily
|
|
5
|
+
so that pulling in `insto.backends` does not pay the cost (and surface the
|
|
6
|
+
runtime dependency footprint) of every backend at once.
|
|
7
|
+
|
|
8
|
+
Practically: `import insto` does not import `hikerapi`. Only the
|
|
9
|
+
`make_backend("hiker", ...)` call does — and that import lives inside the
|
|
10
|
+
function body.
|
|
11
|
+
|
|
12
|
+
Setting `INSTO_BACKEND=fake` in the environment overrides the requested
|
|
13
|
+
name with `"fake"` — a self-contained, network-free backend used by E2E
|
|
14
|
+
tests. The override is intentionally global so the same CLI / REPL entry
|
|
15
|
+
points the user runs are exercised end-to-end without test-only patches.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import os
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
from insto.backends._base import OSINTBackend
|
|
24
|
+
|
|
25
|
+
# Name of the environment variable that, when set to a non-empty value,
# replaces the backend name passed to `make_backend`.
BACKEND_OVERRIDE_ENV = "INSTO_BACKEND"

# Public surface of `insto.backends`; `OSINTBackend` is re-exported from
# `insto.backends._base`.
__all__ = ["BACKEND_OVERRIDE_ENV", "OSINTBackend", "make_backend"]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def make_backend(name: str, **opts: Any) -> OSINTBackend:
    """Build the backend registered under the short name `name`.

    A non-empty ``INSTO_BACKEND`` environment variable takes precedence over
    `name`, whatever the caller asked for.

    Recognised names:
        "hiker" — `HikerBackend`; its module is imported only when selected.
        "fake"  — `FakeBackendProd`, the in-process backend used by E2E
                  tests; its module is likewise imported only when selected.

    `opts` are forwarded unchanged to the backend constructor.

    Raises `ValueError` when the effective name is not one of the above.
    """
    # The environment override wins; an unset or empty variable leaves the
    # caller's choice untouched.
    name = os.environ.get(BACKEND_OVERRIDE_ENV) or name

    # Imports stay inside the branches so that importing this package never
    # pulls in a backend's dependencies.
    if name == "hiker":
        from insto.backends.hiker import HikerBackend

        backend: OSINTBackend = HikerBackend(**opts)
        return backend
    if name == "fake":
        from insto.backends._fake import FakeBackendProd

        backend = FakeBackendProd(**opts)
        return backend
    raise ValueError(f"unknown backend: {name!r}")
|
insto/backends/_base.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Abstract OSINT backend interface.
|
|
2
|
+
|
|
3
|
+
`OSINTBackend` is the contract every backend (HikerAPI v0.1, aiograpi v0.2,
|
|
4
|
+
future TikTok / Bluesky / Threads providers) must implement. The command and
|
|
5
|
+
service layers depend on this ABC, never on a concrete backend — that is what
|
|
6
|
+
keeps v0.2 a pure addition.
|
|
7
|
+
|
|
8
|
+
All collection-returning methods are async generators (`AsyncIterator[T]`)
|
|
9
|
+
with an optional `limit: int | None` parameter. Cursors / page tokens are an
|
|
10
|
+
internal implementation detail of each backend and never leak above this
|
|
11
|
+
layer.
|
|
12
|
+
|
|
13
|
+
The methods raise exceptions from `insto.exceptions` exclusively; raw HTTP /
|
|
14
|
+
SDK errors must be mapped to the taxonomy by the backend itself.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from abc import ABC, abstractmethod
|
|
20
|
+
from collections.abc import AsyncIterator
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
from insto.models import (
|
|
24
|
+
Comment,
|
|
25
|
+
Highlight,
|
|
26
|
+
HighlightItem,
|
|
27
|
+
Post,
|
|
28
|
+
Profile,
|
|
29
|
+
Quota,
|
|
30
|
+
Story,
|
|
31
|
+
User,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class OSINTBackend(ABC):
    """Contract for an async OSINT data source covering one social platform.

    An implementation must tolerate concurrent use from within one asyncio
    event loop (the REPL runs a single loop and may run watch tasks in
    parallel). Safety across processes is NOT part of the contract.

    Collection-style methods are declared as plain ``def`` returning an
    `AsyncIterator`; each accepts an optional keyword-only ``limit``.
    """

    # Capability tokens advertised by the backend. A command states its needs
    # with `@command(..., requires=("followed",))` and the dispatcher refuses
    # to run it when the active backend lacks a required token. The base
    # default is empty (HikerAPI offers public OSINT only); an `aiograpi`
    # backend would widen it, e.g. with `{"followed", ...}`.
    capabilities: frozenset[str] = frozenset()

    @abstractmethod
    async def resolve_target(self, username: str) -> str:
        """Map `username` to its stable `pk`; raise `ProfileNotFound` if absent."""

    @abstractmethod
    async def get_profile(self, pk: str) -> Profile:
        """Return the complete profile DTO of the account `pk`."""

    @abstractmethod
    async def get_user_about(self, pk: str) -> dict[str, Any]:
        """Return the `user_about` payload (verification, dates, links) for `pk`."""

    @abstractmethod
    def iter_user_posts(self, pk: str, *, limit: int | None = None) -> AsyncIterator[Post]:
        """Yield the feed posts of `pk`, newest first."""

    @abstractmethod
    def iter_user_followers(self, pk: str, *, limit: int | None = None) -> AsyncIterator[User]:
        """Yield the accounts that follow `pk`."""

    @abstractmethod
    def iter_user_following(self, pk: str, *, limit: int | None = None) -> AsyncIterator[User]:
        """Yield the accounts that `pk` follows."""

    @abstractmethod
    def iter_user_tagged(self, pk: str, *, limit: int | None = None) -> AsyncIterator[Post]:
        """Yield the posts in which `pk` is tagged."""

    @abstractmethod
    def iter_user_highlights(
        self, pk: str, *, limit: int | None = None
    ) -> AsyncIterator[Highlight]:
        """Yield the highlight reels that belong to `pk`."""

    @abstractmethod
    def iter_highlight_items(
        self, highlight_id: str, *, limit: int | None = None
    ) -> AsyncIterator[HighlightItem]:
        """Yield the items contained in the highlight reel `highlight_id`."""

    @abstractmethod
    def iter_post_comments(
        self, media_pk: str, *, limit: int | None = None
    ) -> AsyncIterator[Comment]:
        """Yield the comments left on the post `media_pk`."""

    @abstractmethod
    def iter_post_likers(self, media_pk: str, *, limit: int | None = None) -> AsyncIterator[User]:
        """Yield the users who liked the post `media_pk`."""

    @abstractmethod
    def iter_user_stories(self, pk: str, *, limit: int | None = None) -> AsyncIterator[Story]:
        """Yield the stories of `pk` that are active right now."""

    @abstractmethod
    async def get_suggested(self, pk: str) -> list[User]:
        """Return the accounts suggested as similar to `pk`."""

    @abstractmethod
    def iter_hashtag_posts(self, tag: str, *, limit: int | None = None) -> AsyncIterator[Post]:
        """Yield top / recent posts published under the hashtag `tag`."""

    @abstractmethod
    def get_quota(self) -> Quota:
        """Return the most recently observed quota state of the backend."""

    @abstractmethod
    def get_last_error(self) -> BaseException | None:
        """Return the most recent exception this backend raised, or ``None``."""

    def get_schema_drift_count(self) -> int:
        """Report how many `SchemaDrift` errors were seen during this session.

        The base implementation always answers 0, which lets simple backends
        (in-process fakes) skip the bookkeeping. Real backends override this
        with a running counter that `/health` surfaces, so an operator can
        notice provider degradation.
        """
        return 0

    async def aclose(self) -> None:  # noqa: B027 — intentional empty default
        """Free resources owned by the backend (HTTP clients, sockets, …).

        The base implementation does nothing, so in-memory backends (test
        fakes, future mock backends) can leave it alone. Backends that hold
        network clients (HikerBackend) override it to close them.
        """
|
insto/backends/_cdn.py
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
"""CDN streamer with defense in depth.
|
|
2
|
+
|
|
3
|
+
Single helper used by the service facade to download CDN-hosted media
|
|
4
|
+
(profile pictures, posts, stories, highlights) into a target directory.
|
|
5
|
+
All defenses live here so that command and service code never touch raw
|
|
6
|
+
HTTP for media.
|
|
7
|
+
|
|
8
|
+
Defenses (each one is exercised by tests/test_cdn.py):
|
|
9
|
+
|
|
10
|
+
- HTTPS-only (http:// rejected, http:// redirects rejected)
|
|
11
|
+
- Host allowlist (only `*.cdninstagram.com` and `*.fbcdn.net` accepted, both
|
|
12
|
+
for the initial URL and any redirect target)
|
|
13
|
+
- Filename built from the caller-supplied ``dest`` (intended to be the post
|
|
14
|
+
pk or similar stable id) plus an extension chosen from response
|
|
15
|
+
Content-Type cross-checked against magic-byte sniffing — the CDN-supplied
|
|
16
|
+
filename in the URL is ignored entirely
|
|
17
|
+
- Whitelist of extensions (.jpg/.jpeg/.png/.webp/.mp4/.mov)
|
|
18
|
+
- Per-resource byte budget (default 500 MB)
|
|
19
|
+
- Pre-flight free-disk check (default 1 GB minimum)
|
|
20
|
+
- Atomic write: stream to ``<final>.part``, fsync, rename → final
|
|
21
|
+
- Collision suffix: never overwrites an existing file; appends ``_<n>``
|
|
22
|
+
- ``mtime`` is set from ``taken_at`` if supplied (so file dates match post
|
|
23
|
+
capture time, not download time)
|
|
24
|
+
- macOS xattr tagging via ctypes ``setxattr(2)`` — adds
|
|
25
|
+
``com.apple.metadata:kMDItemUserTags = insto`` so downloaded media is
|
|
26
|
+
visible in Finder Smart Folders. No-op on non-darwin. On darwin, an
|
|
27
|
+
``OSError`` (e.g. NFS, exFAT) is reported once-per-process to stderr
|
|
28
|
+
and otherwise silently swallowed.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from __future__ import annotations
|
|
32
|
+
|
|
33
|
+
import contextlib
|
|
34
|
+
import ctypes
|
|
35
|
+
import ctypes.util
|
|
36
|
+
import os
|
|
37
|
+
import shutil
|
|
38
|
+
import sys
|
|
39
|
+
from datetime import datetime
|
|
40
|
+
from pathlib import Path
|
|
41
|
+
from urllib.parse import urljoin, urlparse
|
|
42
|
+
|
|
43
|
+
import httpx
|
|
44
|
+
|
|
45
|
+
from insto.exceptions import BackendError
|
|
46
|
+
|
|
47
|
+
# Domain suffixes a CDN host must equal or be a subdomain of (see
# `_is_host_allowed`); applied to the initial URL and to redirect targets.
ALLOWED_HOST_SUFFIXES: tuple[str, ...] = ("cdninstagram.com", "fbcdn.net")
# File extensions a downloaded resource may be saved with.
ALLOWED_EXTENSIONS: frozenset[str] = frozenset({".jpg", ".jpeg", ".png", ".webp", ".mp4", ".mov"})

# Normalized Content-Type value -> extension appended to the destination name.
CT_TO_EXT: dict[str, str] = {
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/png": ".png",
    "image/webp": ".webp",
    "video/mp4": ".mp4",
    "video/quicktime": ".mov",
}

# Per-resource download cap: 500 MiB.
DEFAULT_BYTE_BUDGET: int = 500 * 1024 * 1024
# Minimum free space required on the target filesystem before downloading: 1 GiB.
DEFAULT_MIN_FREE_DISK: int = 1024 * 1024 * 1024
# Number of leading bytes collected for magic-byte sniffing.
SNIFF_SIZE: int = 512
# Chunk size requested from the HTTP response body iterator: 64 KiB.
CHUNK_SIZE: int = 64 * 1024
# Maximum number of redirects followed for one download.
MAX_REDIRECTS: int = 5
# Timeout (seconds) used when `stream_to_file` creates its own HTTP client.
DEFAULT_TIMEOUT: float = 30.0

# Extended-attribute name/value written by `_set_macos_tag` on darwin.
# NOTE(review): Finder tags are normally stored under
# `com.apple.metadata:_kMDItemUserTags` (leading underscore) as a binary
# plist array, not as raw bytes — confirm this name/value actually shows up
# in Finder Smart Folders.
_XATTR_NAME: bytes = b"com.apple.metadata:kMDItemUserTags"
_XATTR_VALUE: bytes = b"insto"
# Line printed to stderr the first time `setxattr` fails.
_XATTR_WARN_LINE: str = "note: filesystem does not support xattr; tagging skipped"

# Set to True after the warning above has been printed once in this process.
_xattr_warned: bool = False
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _is_host_allowed(host: str) -> bool:
    """Return True when `host` is an allowlisted CDN domain or a subdomain of one.

    The comparison is case-insensitive. A suffix only matches on a label
    boundary: ``evilfbcdn.net`` is not accepted for ``fbcdn.net``.
    """
    lowered = host.lower()
    for suffix in ALLOWED_HOST_SUFFIXES:
        if lowered == suffix or lowered.endswith("." + suffix):
            return True
    return False
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _normalize_ct(ct: str | None) -> str | None:
|
|
79
|
+
if not ct:
|
|
80
|
+
return None
|
|
81
|
+
return ct.split(";", 1)[0].strip().lower() or None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _sniff(prefix: bytes) -> tuple[str, str] | None:
|
|
85
|
+
"""Sniff magic bytes; return (extension, mime) or None."""
|
|
86
|
+
if prefix.startswith(b"\xff\xd8\xff"):
|
|
87
|
+
return ".jpg", "image/jpeg"
|
|
88
|
+
if prefix.startswith(b"\x89PNG\r\n\x1a\n"):
|
|
89
|
+
return ".png", "image/png"
|
|
90
|
+
if prefix.startswith(b"RIFF") and len(prefix) >= 12 and prefix[8:12] == b"WEBP":
|
|
91
|
+
return ".webp", "image/webp"
|
|
92
|
+
if len(prefix) >= 12 and prefix[4:8] == b"ftyp":
|
|
93
|
+
brand = prefix[8:12]
|
|
94
|
+
if brand == b"qt ":
|
|
95
|
+
return ".mov", "video/quicktime"
|
|
96
|
+
return ".mp4", "video/mp4"
|
|
97
|
+
return None
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _ct_compatible(declared: str, sniffed: str) -> bool:
|
|
101
|
+
if declared == sniffed:
|
|
102
|
+
return True
|
|
103
|
+
return {declared, sniffed} <= {"image/jpeg", "image/jpg"}
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _resolve_collision(base: Path) -> Path:
|
|
107
|
+
"""Return a non-existing path; if ``base`` exists, append ``_1``, ``_2`` …"""
|
|
108
|
+
if not base.exists():
|
|
109
|
+
return base
|
|
110
|
+
stem, suffix, parent = base.stem, base.suffix, base.parent
|
|
111
|
+
n = 1
|
|
112
|
+
while True:
|
|
113
|
+
candidate = parent / f"{stem}_{n}{suffix}"
|
|
114
|
+
if not candidate.exists():
|
|
115
|
+
return candidate
|
|
116
|
+
n += 1
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _set_macos_tag(path: Path) -> None:
|
|
120
|
+
"""Tag ``path`` with ``com.apple.metadata:kMDItemUserTags=insto`` on darwin.
|
|
121
|
+
|
|
122
|
+
No-op on other platforms. On darwin, errors from the underlying
|
|
123
|
+
``setxattr(2)`` (e.g. filesystem without xattr support such as NFS or
|
|
124
|
+
exFAT) are swallowed and reported once per process to stderr.
|
|
125
|
+
"""
|
|
126
|
+
global _xattr_warned
|
|
127
|
+
if sys.platform != "darwin":
|
|
128
|
+
return
|
|
129
|
+
libc_path = ctypes.util.find_library("c")
|
|
130
|
+
if libc_path is None: # pragma: no cover - libc is always present on darwin
|
|
131
|
+
return
|
|
132
|
+
libc = ctypes.CDLL(libc_path, use_errno=True)
|
|
133
|
+
libc.setxattr.argtypes = [
|
|
134
|
+
ctypes.c_char_p,
|
|
135
|
+
ctypes.c_char_p,
|
|
136
|
+
ctypes.c_void_p,
|
|
137
|
+
ctypes.c_size_t,
|
|
138
|
+
ctypes.c_uint32,
|
|
139
|
+
ctypes.c_int,
|
|
140
|
+
]
|
|
141
|
+
libc.setxattr.restype = ctypes.c_int
|
|
142
|
+
rc = libc.setxattr(
|
|
143
|
+
str(path).encode("utf-8"),
|
|
144
|
+
_XATTR_NAME,
|
|
145
|
+
_XATTR_VALUE,
|
|
146
|
+
len(_XATTR_VALUE),
|
|
147
|
+
0,
|
|
148
|
+
0,
|
|
149
|
+
)
|
|
150
|
+
if rc != 0 and not _xattr_warned:
|
|
151
|
+
_xattr_warned = True
|
|
152
|
+
print(_XATTR_WARN_LINE, file=sys.stderr)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _validate_url(url: str) -> None:
    """Reject `url` unless it is an https URL on an allowlisted CDN host.

    Raises `BackendError` when the scheme is not ``https`` or when the host
    is missing or fails `_is_host_allowed`.
    """
    parsed = urlparse(url)
    if parsed.scheme != "https":
        raise BackendError(f"non-https CDN url rejected: {url}")

    host = parsed.hostname
    host_ok = bool(host) and _is_host_allowed(host)
    if not host_ok:
        raise BackendError(f"CDN host not in allowlist: {parsed.hostname}")
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _validate_redirect(current: str, location: str) -> str:
    """Resolve a redirect ``Location`` against `current` and vet the result.

    Returns the absolute redirect target. Raises `BackendError` when the
    target is not https or its host is missing / not allowlisted.
    """
    # `location` may be relative; resolve it against the URL that produced it.
    new_url = urljoin(current, location)
    parsed = urlparse(new_url)

    if parsed.scheme != "https":
        raise BackendError(f"CDN redirect to non-https rejected: {new_url}")

    host = parsed.hostname
    if host and _is_host_allowed(host):
        return new_url
    raise BackendError(f"CDN cross-host redirect rejected: {parsed.hostname}")
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _decide_extension(declared_ct: str | None, sniffed: tuple[str, str]) -> str:
    """Choose the file extension from the declared and sniffed content types.

    With no declared type the sniffed extension is used. With a declared type
    it must be compatible with the sniffed MIME; the extension then comes
    from `CT_TO_EXT`, falling back to the sniffed one for unmapped types.

    Raises `BackendError` on a declared/sniffed mismatch or when the chosen
    extension is not in `ALLOWED_EXTENSIONS`.
    """
    sniff_ext, sniff_mime = sniffed

    if declared_ct is None:
        ext = sniff_ext
    elif _ct_compatible(declared_ct, sniff_mime):
        ext = CT_TO_EXT.get(declared_ct, sniff_ext)
    else:
        raise BackendError(
            f"CDN content-type mismatch: header={declared_ct} sniff={sniff_mime}"
        )

    if ext in ALLOWED_EXTENSIONS:
        return ext
    raise BackendError(f"CDN extension not in allowlist: {ext}")
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _coerce_taken_at(taken_at: datetime | float | int | None) -> float | None:
|
|
189
|
+
if taken_at is None:
|
|
190
|
+
return None
|
|
191
|
+
if isinstance(taken_at, datetime):
|
|
192
|
+
return taken_at.timestamp()
|
|
193
|
+
return float(taken_at)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
async def stream_to_file(
    url: str,
    dest: Path,
    *,
    content_type_hint: str | None = None,
    byte_budget: int = DEFAULT_BYTE_BUDGET,
    taken_at: datetime | float | int | None = None,
    client: httpx.AsyncClient | None = None,
    min_free_disk: int = DEFAULT_MIN_FREE_DISK,
    timeout: float = DEFAULT_TIMEOUT,
) -> Path:
    """Download the CDN resource at `url` next to ``dest`` (a path *without* extension).

    Steps, in order: validate `url`, create ``dest.parent`` if needed, check
    free disk space, then hand over to `_download_with_redirects`. The
    extension is derived from the response Content-Type cross-checked against
    the leading magic bytes; when the resulting name is taken, ``_1`` /
    ``_2`` … is inserted before the extension.

    Args:
        url: https URL on an allowlisted CDN host.
        dest: target path without extension.
        content_type_hint: used only when the response has no Content-Type.
        byte_budget: maximum number of body bytes accepted.
        taken_at: optional capture time applied to the file's atime/mtime.
        client: HTTP client to reuse; when omitted, one is created with
            redirects disabled and `timeout`, and closed before returning.
        min_free_disk: minimum free bytes required under ``dest.parent``.
        timeout: timeout for the internally created client only.

    Returns:
        The path actually written.

    Raises:
        BackendError: on validation failure or insufficient disk space (and
            whatever the download helpers raise).
    """

    _validate_url(url)

    target_dir = dest.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    free = shutil.disk_usage(target_dir).free
    if free < min_free_disk:
        raise BackendError(f"insufficient disk space: {free} bytes free < {min_free_disk}")

    # Only a client created here is closed here; a caller's client is left open.
    owns_client = client is None
    session = (
        httpx.AsyncClient(follow_redirects=False, timeout=timeout) if owns_client else client
    )

    try:
        return await _download_with_redirects(
            url=url,
            dest=dest,
            content_type_hint=content_type_hint,
            byte_budget=byte_budget,
            taken_at=_coerce_taken_at(taken_at),
            client=session,
        )
    finally:
        if owns_client:
            await session.aclose()
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
async def _download_with_redirects(
    *,
    url: str,
    dest: Path,
    content_type_hint: str | None,
    byte_budget: int,
    taken_at: float | None,
    client: httpx.AsyncClient,
) -> Path:
    """GET `url`, following redirects by hand, and stream the final body to disk.

    Every redirect target is vetted by `_validate_redirect` (https only,
    allowlisted host) before it is requested; at most `MAX_REDIRECTS`
    redirects are followed. The first ``200`` response is passed to
    `_stream_response`. The response Content-Type takes precedence over
    `content_type_hint`, which is used only when the header is absent.

    Returns:
        The path written by `_stream_response`.

    Raises:
        BackendError: too many redirects, a redirect without ``Location``, a
            rejected redirect target, or a final status other than 200.
    """
    current_url = url
    redirects = 0
    while True:
        # Force manual redirect handling per request. A caller-supplied client
        # may have been built with follow_redirects=True, in which case httpx
        # would silently follow redirects and skip the validation below.
        async with client.stream("GET", current_url, follow_redirects=False) as resp:
            if 300 <= resp.status_code < 400:
                if redirects >= MAX_REDIRECTS:
                    raise BackendError("too many CDN redirects")
                redirects += 1
                location = resp.headers.get("location")
                if not location:
                    raise BackendError("CDN redirect without Location header")
                current_url = _validate_redirect(current_url, location)
                # Leaving the `async with` closes this response before the
                # next hop is requested.
                continue

            if resp.status_code != 200:
                raise BackendError(f"CDN GET failed: HTTP {resp.status_code}")

            declared_ct = _normalize_ct(resp.headers.get("content-type")) or _normalize_ct(
                content_type_hint
            )
            return await _stream_response(
                resp=resp,
                dest=dest,
                declared_ct=declared_ct,
                byte_budget=byte_budget,
                taken_at=taken_at,
            )
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
async def _stream_response(
    *,
    resp: httpx.Response,
    dest: Path,
    declared_ct: str | None,
    byte_budget: int,
    taken_at: float | None,
) -> Path:
    """Write the body of `resp` to disk and return the final path.

    The body is streamed to ``<dest.name>.part`` beside `dest`, flushed and
    fsynced, then renamed to ``dest.name`` plus the extension chosen by
    `_decide_extension` (with a collision suffix from `_resolve_collision`).
    When `taken_at` is given it is applied as atime/mtime, and the file is
    passed to `_set_macos_tag`.

    Raises `BackendError` when the body is empty, its magic bytes are not
    recognised, the declared and sniffed types disagree, or the byte budget
    is exceeded. On any exception the ``.part`` file is removed (best
    effort) and the exception is re-raised.

    NOTE(review): nesting below was reconstructed from a flattened diff view;
    in particular, the rename is assumed to happen after the ``with`` block
    closes the file — confirm against the packaged source.
    """
    parent = dest.parent
    tmp_path = parent / (dest.name + ".part")
    # A stale temp file from an earlier attempt is discarded up front.
    if tmp_path.exists():
        tmp_path.unlink()

    sniff_buf = bytearray()
    sniffed: tuple[str, str] | None = None
    total = 0

    try:
        with open(tmp_path, "wb") as fh:
            async for chunk in resp.aiter_bytes(CHUNK_SIZE):
                if not chunk:
                    continue
                # Collect up to SNIFF_SIZE leading bytes; sniff as soon as the
                # buffer is full so a bad payload is rejected early.
                if sniffed is None and len(sniff_buf) < SNIFF_SIZE:
                    sniff_buf.extend(chunk[: SNIFF_SIZE - len(sniff_buf)])
                    if len(sniff_buf) >= SNIFF_SIZE:
                        sniffed = _sniff(bytes(sniff_buf))
                        if sniffed is None:
                            raise BackendError("CDN content-type sniff failed: unknown magic bytes")
                        if declared_ct is not None and not _ct_compatible(declared_ct, sniffed[1]):
                            raise BackendError(
                                "CDN content-type mismatch: "
                                f"header={declared_ct} sniff={sniffed[1]}"
                            )
                # Budget is checked before writing, so the file never exceeds it.
                if total + len(chunk) > byte_budget:
                    raise BackendError(f"CDN response exceeded byte budget {byte_budget}")
                fh.write(chunk)
                total += len(chunk)

            # Body shorter than SNIFF_SIZE: sniff whatever was received.
            if sniffed is None:
                if not sniff_buf:
                    raise BackendError("CDN response was empty")
                sniffed = _sniff(bytes(sniff_buf))
                if sniffed is None:
                    raise BackendError("CDN content-type sniff failed: unknown magic bytes")
            fh.flush()
            os.fsync(fh.fileno())

        # `_decide_extension` re-checks declared vs. sniffed type, which also
        # covers the short-body path above.
        ext = _decide_extension(declared_ct, sniffed)
        final_base = parent / (dest.name + ext)
        final = _resolve_collision(final_base)
        os.rename(tmp_path, final)

        if taken_at is not None:
            os.utime(final, (taken_at, taken_at))

        _set_macos_tag(final)
        return final
    except BaseException:
        # Covers cancellation as well; after a successful rename the temp
        # path no longer exists and nothing is removed.
        if tmp_path.exists():
            with contextlib.suppress(OSError):
                tmp_path.unlink()
        raise
|