durable-sync 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- durable_sync/__init__.py +26 -0
- durable_sync/activities.py +156 -0
- durable_sync/auth/__init__.py +8 -0
- durable_sync/auth/oauth/__init__.py +18 -0
- durable_sync/auth/oauth/flow.py +183 -0
- durable_sync/auth/oauth/refresh.py +58 -0
- durable_sync/auth/oauth/store.py +36 -0
- durable_sync/auth/oauth/token.py +36 -0
- durable_sync/auth/oauth/workflow.py +172 -0
- durable_sync/bootstrap.py +44 -0
- durable_sync/codec.py +80 -0
- durable_sync/config.py +35 -0
- durable_sync/connectors/__init__.py +14 -0
- durable_sync/connectors/asana/__init__.py +13 -0
- durable_sync/connectors/asana/destination.py +213 -0
- durable_sync/connectors/content.py +80 -0
- durable_sync/connectors/contentful/__init__.py +25 -0
- durable_sync/connectors/contentful/api.py +285 -0
- durable_sync/connectors/contentful/bootstrap.py +102 -0
- durable_sync/connectors/contentful/describe.py +61 -0
- durable_sync/connectors/contentful/destination.py +145 -0
- durable_sync/connectors/contentful/encode.py +49 -0
- durable_sync/connectors/contentful/introspect.py +69 -0
- durable_sync/connectors/contentful/mcp.py +95 -0
- durable_sync/connectors/contentful/mcp_destination.py +137 -0
- durable_sync/connectors/contentful/oauth.py +27 -0
- durable_sync/connectors/contentful/prove.py +51 -0
- durable_sync/connectors/contentful/source.py +192 -0
- durable_sync/connectors/contentful/start.py +46 -0
- durable_sync/connectors/contentful/store.py +25 -0
- durable_sync/connectors/contentful/token.py +13 -0
- durable_sync/connectors/contentful/token_check.py +42 -0
- durable_sync/connectors/github/__init__.py +33 -0
- durable_sync/connectors/github/api.py +169 -0
- durable_sync/connectors/github/source.py +230 -0
- durable_sync/connectors/luma/__init__.py +20 -0
- durable_sync/connectors/luma/api.py +121 -0
- durable_sync/connectors/luma/destination.py +128 -0
- durable_sync/connectors/luma/source.py +155 -0
- durable_sync/connectors/multi.py +78 -0
- durable_sync/connectors/notion/__init__.py +20 -0
- durable_sync/connectors/notion/bootstrap.py +97 -0
- durable_sync/connectors/notion/client.py +133 -0
- durable_sync/connectors/notion/destination.py +270 -0
- durable_sync/connectors/notion/oauth.py +25 -0
- durable_sync/connectors/notion/prove.py +57 -0
- durable_sync/connectors/notion/source.py +136 -0
- durable_sync/connectors/notion/start.py +46 -0
- durable_sync/connectors/notion/store.py +25 -0
- durable_sync/connectors/notion/token.py +13 -0
- durable_sync/connectors/youtube/__init__.py +13 -0
- durable_sync/connectors/youtube/api.py +122 -0
- durable_sync/connectors/youtube/source.py +152 -0
- durable_sync/core.py +210 -0
- durable_sync/env.py +55 -0
- durable_sync/http.py +71 -0
- durable_sync/linkstore.py +88 -0
- durable_sync/route.py +86 -0
- durable_sync/temporal_client.py +48 -0
- durable_sync/transport/__init__.py +12 -0
- durable_sync/transport/mcp.py +77 -0
- durable_sync/worker.py +109 -0
- durable_sync/workflows/__init__.py +9 -0
- durable_sync/workflows/sync.py +208 -0
- durable_sync-0.1.0.dist-info/METADATA +310 -0
- durable_sync-0.1.0.dist-info/RECORD +69 -0
- durable_sync-0.1.0.dist-info/WHEEL +5 -0
- durable_sync-0.1.0.dist-info/licenses/LICENSE +21 -0
- durable_sync-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""YouTube Data API v3 helpers — pure async HTTP + small pure transforms. No
|
|
2
|
+
Temporal, no config globals. Lists a channel's "uploads" playlist (channel id or
|
|
3
|
+
@handle, resolved here), yielding {videoId, title, description, publishedAt,
|
|
4
|
+
viewCount}. Read-only.
|
|
5
|
+
|
|
6
|
+
Docs: https://developers.google.com/youtube/v3/docs
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import httpx
|
|
15
|
+
|
|
16
|
+
from durable_sync.http import request_with_retry
|
|
17
|
+
|
|
18
|
+
API = "https://www.googleapis.com/youtube/v3"
|
|
19
|
+
PAGE = 50 # API max
|
|
20
|
+
log = logging.getLogger("durable_sync.connectors.youtube")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
async def _get(client: httpx.AsyncClient, api_key: str, path: str, params: dict[str, Any]) -> dict[str, Any]:
    """GET ``{API}/{path}`` and return the decoded JSON body.

    The API key is merged into a copy of ``params`` under ``"key"`` (the caller's
    dict is not mutated). The request goes through ``request_with_retry``; the
    response it finally returns is checked with ``raise_for_status()``, so a
    non-success status surfaces as an exception rather than as a payload.
    """
    query = dict(params)
    query["key"] = api_key
    response = await request_with_retry(client, "GET", f"{API}/{path}", params=query)
    response.raise_for_status()
    return response.json()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse_ts(ts: str) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-AWARE ``datetime``.

    * A trailing ``Z``/``z`` (UTC designator) is rewritten to ``+00:00`` so that
      ``datetime.fromisoformat`` accepts it on interpreters that do not parse
      ``Z`` natively. Only the trailing designator is touched.
    * A timestamp that carries no offset at all is taken to be UTC. Returning a
      naive value here would make the ``parse_ts(pub) < after`` comparison in
      ``list_videos_page`` raise ``TypeError`` against the aware window start.

    Raises:
        ValueError: if ``ts`` is not a valid ISO-8601 timestamp.
    """
    # Local import: this module only imports ``datetime`` from ``datetime``.
    from datetime import timezone

    text = ts.strip()
    if text[-1:] in ("Z", "z"):
        text = text[:-1] + "+00:00"
    parsed = datetime.fromisoformat(text)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
async def uploads_playlist(client: httpx.AsyncClient, api_key: str, channel: str) -> str:
    """Return the uploads playlist id for a channel id ('UC…') or an '@handle'.

    After trimming whitespace, a value that starts with ``UC`` and contains no
    space is looked up by ``id``; anything else is looked up by ``forHandle``
    with leading ``@`` characters removed.

    Raises:
        RuntimeError: if the ``channels`` lookup returns no items.
    """
    ident = channel.strip()
    looks_like_id = ident.startswith("UC") and " " not in ident
    if looks_like_id:
        lookup = {"part": "contentDetails", "id": ident}
    else:
        lookup = {"part": "contentDetails", "forHandle": ident.lstrip("@")}
    payload = await _get(client, api_key, "channels", lookup)

    matches = payload.get("items", [])
    if matches:
        return matches[0]["contentDetails"]["relatedPlaylists"]["uploads"]
    raise RuntimeError(f"YouTube channel not found: {channel!r}")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
async def list_videos_page(
    client: httpx.AsyncClient, api_key: str, playlist: str, after_iso: str, *, page_token: str | None = None
) -> tuple[list[dict[str, Any]], str | None]:
    """Fetch ONE ``playlistItems`` page and return ``(metas, next_page_token)``.

    Each meta is ``{videoId, title, description, publishedAt, viewCount}``. Only
    videos published on/after ``after_iso`` are kept. The uploads playlist is
    treated as reverse-chronological: the first video that predates the window
    ends the scan and forces the returned token to ``None``. This token is the
    cursor the spine threads through ``YouTubeSource.fetch_page``.

    Items lacking a video id or any publish timestamp are skipped. View counts
    for the kept videos are fetched in one ``view_counts`` call.
    """
    cutoff = parse_ts(after_iso)
    query: dict[str, Any] = {"part": "snippet,contentDetails", "playlistId": playlist, "maxResults": PAGE}
    if page_token:
        query["pageToken"] = page_token
    payload = await _get(client, api_key, "playlistItems", query)

    page: list[dict[str, Any]] = []
    reached_cutoff = False
    for entry in payload.get("items", []):
        snippet = entry.get("snippet", {})
        details = entry.get("contentDetails", {})
        video_id = details.get("videoId")
        # Prefer the video's own publish time; fall back to the snippet's.
        published = details.get("videoPublishedAt") or snippet.get("publishedAt")
        if not (video_id and published):
            continue
        if parse_ts(published) < cutoff:
            reached_cutoff = True
            break
        page.append({
            "videoId": video_id,
            "title": snippet.get("title", ""),
            "description": snippet.get("description", ""),
            "publishedAt": published,
        })

    if page:
        counts = await view_counts(client, api_key, [meta["videoId"] for meta in page])
    else:
        counts = {}
    for meta in page:
        meta["viewCount"] = counts.get(meta["videoId"])

    if reached_cutoff:
        return page, None
    return page, (payload.get("nextPageToken") or None)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
async def list_videos(client: httpx.AsyncClient, api_key: str, playlist: str, after_iso: str) -> list[dict[str, Any]]:
    """Return every video on/after ``after_iso`` by draining ``list_videos_page``.

    Intended for non-Temporal callers; the spine pages directly instead.
    """
    collected: list[dict[str, Any]] = []
    token: str | None = None
    while True:
        batch, token = await list_videos_page(client, api_key, playlist, after_iso, page_token=token)
        collected += batch
        if token is None:
            break
    return collected
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
async def videos_by_id(client: httpx.AsyncClient, api_key: str, ids: list[str]) -> list[dict[str, Any]]:
    """Fetch specific videos by id (for targeted refreshes).

    Returns metas in the same shape as the list endpoints:
    ``{videoId, title, description, publishedAt, viewCount}``. Falsy ids are
    dropped; an empty request returns ``[]`` without calling the API.

    The ids are requested in batches of at most ``PAGE`` per call, so a refresh
    naming more than ``PAGE`` videos returns all of them instead of silently
    truncating to the first batch.
    """
    wanted = [i for i in ids if i]
    out: list[dict[str, Any]] = []
    for start in range(0, len(wanted), PAGE):
        batch = wanted[start:start + PAGE]
        data = await _get(client, api_key, "videos", {"part": "snippet,statistics", "id": ",".join(batch)})
        for it in data.get("items", []):
            sn, st = it.get("snippet", {}), it.get("statistics", {})
            vc = st.get("viewCount")
            out.append({
                "videoId": it.get("id", ""),
                "title": sn.get("title", ""),
                "description": sn.get("description", ""),
                "publishedAt": sn.get("publishedAt"),
                # The count is converted with int(); absent stays None.
                "viewCount": int(vc) if vc is not None else None,
            })
    return out
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
async def view_counts(client: httpx.AsyncClient, api_key: str, ids: list[str]) -> dict[str, int | None]:
    """Map each returned video id to its view count, using one batched call.

    The caller is expected to pass at most ``PAGE`` ids. A video whose
    statistics carry no ``viewCount`` maps to ``None``; ids the API does not
    return are simply absent from the result. An empty ``ids`` returns ``{}``
    without a request.
    """
    if not ids:
        return {}
    payload = await _get(client, api_key, "videos", {"part": "statistics", "id": ",".join(ids)})
    counts: dict[str, int | None] = {}
    for item in payload.get("items", []):
        raw = item.get("statistics", {}).get("viewCount")
        counts[item["id"]] = None if raw is None else int(raw)
    return counts
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""YouTubeSource — a channel's uploads -> Records, with a source-side enrich hook.
|
|
2
|
+
|
|
3
|
+
YouTube has no per-video author field. The base Record therefore leaves "Author"
|
|
4
|
+
empty and stashes the title + description as a "Scan Text" property (and on the
|
|
5
|
+
enrich context) so an app that needs attribution can scan it for known names —
|
|
6
|
+
an "inverted match" — rather than relying on a structured author. That policy
|
|
7
|
+
lives in your `enrich` hook, not here.
|
|
8
|
+
|
|
9
|
+
Auth: a YouTube Data API v3 key, read from the env var named by
|
|
10
|
+
`YouTubeConfig.token_env`. Requires the `youtube` extra.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import inspect
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from datetime import datetime, timedelta, timezone
|
|
19
|
+
from typing import Awaitable, Callable, Union
|
|
20
|
+
|
|
21
|
+
import httpx
|
|
22
|
+
from temporalio import activity
|
|
23
|
+
|
|
24
|
+
from durable_sync.core import Record, SourceSpec
|
|
25
|
+
from durable_sync.connectors import content
|
|
26
|
+
from durable_sync.connectors.youtube import api
|
|
27
|
+
|
|
28
|
+
log = logging.getLogger("durable_sync.connectors.youtube")
|
|
29
|
+
|
|
30
|
+
EnrichHook = Callable[[Record, "YouTubeVideoContext"], Union[Record, Awaitable[Record]]]
|
|
31
|
+
|
|
32
|
+
_MAX_SUMMARY = 2000
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class YouTubeConfig:
    """Deployment-supplied YouTube settings.

    Only `channel` is required: either a channel id ('UC…') or a handle
    ('@name'), which the client resolves to an id.
    """

    # Channel id or @handle to sync.
    channel: str
    # Name of the environment variable that holds the YouTube Data API key.
    token_env: str = "YOUTUBE_API_KEY"
    # How many days back from "now" the fetch window starts.
    lookback_days: int = 21
    # Interval handed to the SourceSpec for this channel.
    interval_minutes: int = 360
    # Passed to the record builder as the title property name.
    title_property: str = "Name"
    # Value written to the neutral "Type" column.
    item_type: str = "Video"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class YouTubeVideoContext:
    """Per-video context passed to the enrich hook.

    Carries the raw video meta (including the full description) together with
    the live HTTP client and API key, so a hook can do inverted name-matching
    or issue extra lookups of its own.
    """

    # The video meta dict as produced by the API helpers.
    raw_video: dict
    # Title + description, for inverted matching.
    scan_text: str
    # The client that is open for the current fetch.
    client: httpx.AsyncClient
    # The API key used for the current fetch.
    api_key: str
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _heartbeat(detail: str) -> None:
    """Send an activity heartbeat carrying ``detail``; do nothing outside an activity."""
    if not activity.in_activity():
        return
    activity.heartbeat(detail)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class YouTubeSource:
    """Source connector: one channel's uploads mapped to neutral ``Record``s.

    An optional ``enrich`` hook (sync or async) is called once per video with
    the base record and a ``YouTubeVideoContext``; whatever it returns replaces
    the record. The base record always has an empty author — any attribution
    policy belongs in the hook.
    """

    name = "youtube"

    def __init__(self, config: YouTubeConfig, *, enrich: EnrichHook | None = None):
        """Store the deployment config and the optional per-video enrich hook."""
        self._config = config
        self._enrich = enrich

    def specs(self) -> list[SourceSpec]:
        """Return the single unit of work: the configured channel."""
        ch = self._config.channel
        return [SourceSpec(key=f"channel:{ch}", interval_minutes=self._config.interval_minutes,
                           params={"channel": ch})]

    async def fetch_page(
        self, spec: SourceSpec, only_items: list[str] | None, cursor: str | None
    ) -> tuple[list[Record], str | None]:
        """Return ONE page of videos plus the next cursor (None on the last page).

        The cursor carries the frozen window start, the resolved uploads
        playlist (so the channel is not re-resolved on every page), and
        YouTube's pageToken. A targeted (``only_items``) refresh is bounded, so
        it returns a single page with no cursor.

        The API key is read from the environment variable named by
        ``config.token_env`` (an empty string if it is unset).
        """
        cfg = self._config
        api_key = os.environ.get(cfg.token_env, "")
        channel = spec.params.get("channel", cfg.channel)

        async with httpx.AsyncClient(timeout=30) as client:
            if only_items:
                videos = await api.videos_by_id(client, api_key, only_items)
                next_cursor = None
            else:
                if cursor is None:
                    # First page: freeze the window start and resolve the playlist once.
                    after_iso = (datetime.now(timezone.utc) - timedelta(days=cfg.lookback_days)).isoformat()
                    playlist = await api.uploads_playlist(client, api_key, channel)
                    page_token = None
                else:
                    c = content.unpack_cursor(cursor)
                    after_iso, playlist, page_token = c["after"], c["playlist"], c["token"]
                videos, next_token = await api.list_videos_page(
                    client, api_key, playlist, after_iso, page_token=page_token)
                next_cursor = (
                    content.pack_cursor(after=after_iso, playlist=playlist, token=next_token)
                    if next_token else None
                )

            # Enrichment runs while the client is still open: the hook receives it.
            out: list[Record] = []
            for v in videos:
                record = self._to_record(v)
                if self._enrich is not None:
                    title = v.get("title") or ""
                    # `or ""` (not a .get default) so a null description never
                    # renders as the literal "None" — same rule as _to_record.
                    description = v.get("description") or ""
                    scan_text = f"{title}\n{description}"
                    ctx = YouTubeVideoContext(raw_video=v, scan_text=scan_text, client=client, api_key=api_key)
                    result = self._enrich(record, ctx)
                    # The hook may be sync or async.
                    record = await result if inspect.isawaitable(result) else result
                out.append(record)
                _heartbeat(v.get("videoId", ""))

        log.info("Fetched %d YouTube videos for %s (cursor=%s)", len(out), spec.key, cursor)
        return out, next_cursor

    async def fetch(self, spec: SourceSpec, only_items: list[str] | None = None) -> list[Record]:
        """Return the whole window as one list by draining ``fetch_page``
        (standalone / non-Temporal use)."""
        records: list[Record] = []
        cursor: str | None = None
        while True:
            page, cursor = await self.fetch_page(spec, only_items, cursor)
            records.extend(page)
            if cursor is None:
                return records

    def _to_record(self, v: dict) -> Record:
        """Map one video meta dict to a neutral Record. Pure (no IO).

        A missing title becomes "(untitled video)"; the URL is omitted when
        there is no video id; "Summary" and "Scan Text" are each capped at
        ``_MAX_SUMMARY`` characters.
        """
        cfg = self._config
        vid = v.get("videoId", "")
        title = v.get("title") or "(untitled video)"
        description = v.get("description") or ""
        return content.content_record(
            primary_key=vid,
            title_property=cfg.title_property,
            title=title,
            item_type=cfg.item_type,
            source="YouTube",
            url=f"https://www.youtube.com/watch?v={vid}" if vid else None,
            date=v.get("publishedAt"),
            status="Published",
            author="",  # no per-video author on YouTube
            extra={
                "Reach": v.get("viewCount"),
                "Summary": description[:_MAX_SUMMARY],
                # Free text for inverted matching by an app's enrich/transform hook.
                "Scan Text": f"{title}\n{description}"[:_MAX_SUMMARY],
            },
        )
|
durable_sync/core.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""Generic, source/destination-agnostic spine. No I/O here — this module is
|
|
2
|
+
imported into the Temporal workflow sandbox, so it must stay side-effect-free.
|
|
3
|
+
|
|
4
|
+
The whole library reduces to two seams:
|
|
5
|
+
|
|
6
|
+
* a Source produces `Record`s (fetch + map your data),
|
|
7
|
+
* a Destination upserts them idempotently.
|
|
8
|
+
|
|
9
|
+
Everything painful — durable orchestration, idempotent upsert, OAuth refresh,
|
|
10
|
+
pagination, rate-limit backoff, error handling — lives in the spine and is
|
|
11
|
+
inherited for free. To add a source you implement `Source`; to add a
|
|
12
|
+
destination, `Destination`. Reference implementations: GitHub (source), Notion
|
|
13
|
+
and Asana (destinations).
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import datetime as dt
|
|
18
|
+
import re
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from typing import Any, AsyncContextManager, Protocol, runtime_checkable
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class Record:
    """A single row to upsert, expressed in DESTINATION-AGNOSTIC form.

    Values in `properties` are NEUTRAL Python types. Wire-encoding belongs to
    the Destination, so a Source author never has to learn a destination's
    quirks (Notion's multi-select JSON, Asana's custom fields, etc.):

        str               -> text / url / select / title
        bool              -> checkbox
        int | float       -> number
        list[str]         -> multi-select
        datetime.date     -> date
        datetime.datetime -> datetime
        None              -> property omitted

    `primary_key` is the IMMUTABLE idempotency key (e.g. a repo id) and must
    never be a name or URL; that immutability is what makes at-least-once
    retries safe. `body` is optional long-form content (e.g. a README or task
    notes), written on create.
    """

    primary_key: str
    properties: dict[str, Any]
    body: str | None = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class SourceSpec:
    """A unit of work for a Source, handed to its per-source entity workflow.

    `key` is a stable id from which the workflow id is derived. `params` is
    opaque, source-defined config (e.g. {"kind": "org", "org": "temporal-community"}).
    """

    key: str
    interval_minutes: int = 30
    # default_factory so every spec gets its own dict.
    params: dict[str, Any] = field(default_factory=dict)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@runtime_checkable
class Source(Protocol):
    """Protocol to implement for a data source (GitHubSource is the reference impl)."""

    name: str

    def specs(self) -> list[SourceSpec]:
        """Return one SourceSpec per independent unit; each gets its own workflow."""
        ...

    async def fetch(
        self, spec: SourceSpec, only_items: list[str] | None = None
    ) -> list[Record]:
        """Fetch the unit (or just `only_items`) and map it to Records.

        All source-specific I/O and field-mapping happens here. The WHOLE unit
        comes back as one list, which is the simplest option and fine up to
        roughly hundreds of records.

        A source that can return many thousands should implement `fetch_page`
        (described below) instead: the spine then drives it page-by-page, so
        neither the fetch result nor the upsert ever passes through Temporal
        history as one oversized payload.
        """
        ...

    # OPTIONAL — the spine looks it up with getattr, like the Destination's aux
    # hooks — but PREFERRED: every shipped connector (GitHub/Luma/YouTube/
    # Contentful) implements it and keeps fetch() as a thin drain over it. The
    # spine calls it repeatedly, threading your cursor, and upserts each page
    # before asking for the next, which bounds history regardless of total size.
    # When present it is used in preference to fetch(). A genuinely tiny source
    # can skip it and implement only fetch() (the spine treats that as one page).
    #
    #   async def fetch_page(
    #       self, spec: SourceSpec, only_items: list[str] | None, cursor: str | None
    #   ) -> tuple[list[Record], str | None]:
    #       """Return (records_for_this_page, next_cursor). next_cursor is None on
    #       the last page. `cursor` is None on the first call. Opaque to the spine —
    #       use whatever your API's pagination token is (offset, page no, cursor)."""
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class DestinationSession(Protocol):
    """One open connection to the destination, used for a single sync pass."""

    async def query_existing_ids(self) -> dict[str, str]:
        """Return { primary_key -> destination-internal id } for rows already present."""
        ...

    async def create(self, record: Record, synced_at: dt.datetime) -> bool:
        """Insert a new row.

        `synced_at` is the sync-pass timestamp, passed as a real datetime; the
        destination formats it however its schema requires.

        Returns True if the row was written, False if it was SKIPPED (e.g. a
        destination-side enrich hook dropped the record as out-of-scope).
        """
        ...

    async def update(self, existing_id: str, record: Record, synced_at: dt.datetime) -> bool:
        """Refresh an existing row.

        `create_only` properties are left untouched so human edits to those
        seeds survive. `synced_at` has the same meaning as in create().

        Returns True if written, False if skipped.
        """
        ...
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class Destination(Protocol):
    """Protocol to implement for a destination.

    NotionDestination and AsanaDestination are the reference impls (MCP+OAuth
    and REST+PAT respectively); the protocol is intentionally neither
    transport- nor auth-shaped.
    """

    name: str

    # True once the destination holds the config it needs to write (e.g. a
    # target id). The spine refuses to sync an unconfigured destination.
    configured: bool

    # Properties written only on CREATE: enrichment seeds that a human refines
    # and that are never overwritten on update. The mechanism is generic; each
    # Source supplies which fields. Honored by update().
    create_only_properties: set[str]

    def connect(self) -> AsyncContextManager[DestinationSession]: ...

    # OPTIONAL hooks (looked up with getattr by the worker — leave undefined if unused):
    #   def aux_workflows(self) -> list: ...   extra Temporal workflows to register
    #   def aux_activities(self) -> list: ...  extra activities to register
    # e.g. the Notion destination registers its token-owner auth workflow here.

    @property
    def config_hint(self) -> str:
        """Human-readable hint naming what to set when `configured` is False
        (e.g. an env var).

        Keeps destination-specific config names out of the generic spine's
        error messages.
        """
        ...

    @staticmethod
    def is_auth_error(err: BaseException) -> bool:
        """Return True if `err` is an auth failure only a human can fix, so the
        workflow pauses instead of hammering.

        Destination-specific. OPTIONAL: a destination with no interactive auth
        (e.g. a local DB) should just `return False`. Most HTTP destinations
        can delegate to `auth_error_in_chain`, defined later in this module.
        """
        ...
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class DestinationHTTPError(RuntimeError):
    """HTTP failure from a destination, with the numeric `status_code` kept
    SEPARATE from the message text.

    `auth_error_in_chain` classifies auth failures from this code instead of
    scanning the (up-to-600-char) response body, where a stray standalone "403"
    in an error payload would spuriously pause the workflow. Destinations
    should raise this, not a bare RuntimeError, for HTTP failures.
    """

    def __init__(self, status_code: int, message: str) -> None:
        """Record `status_code` and use `message` as the exception text."""
        self.status_code = status_code
        super().__init__(message)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# Default auth-failure signatures shared by HTTP destinations. The numeric code is
# taken from a DestinationHTTPError.status_code when present; otherwise (a plain
# exception) we fall back to a WORD-BOUNDARY text match so a bare "401"/"403"
# inside a UUID or request-id can't false-positive — the bug that once paused a
# workflow on a Notion validation_error whose id contained "401e".
_AUTH_TEXT_NEEDLES = ("unauthorized", "forbidden", "invalid_token", "invalid_grant")
_AUTH_STATUS_CODES = (401, 403)
_AUTH_CODE_RE = re.compile(r"\b(401|403)\b")

# BaseExceptionGroup is a builtin only on Python 3.11+. Resolve it once so the
# chain walk below does not raise NameError on older interpreters; an empty
# tuple makes the isinstance() check a harmless False there.
try:
    _EXC_GROUP_TYPES = (BaseExceptionGroup,)
except NameError:  # Python < 3.11: no exception groups to unpack
    _EXC_GROUP_TYPES = ()


def auth_error_in_chain(err: BaseException, *, extra_needles: tuple[str, ...] = ()) -> bool:
    """Shared `is_auth_error` implementation.

    Walks `err`'s ``__cause__``/``__context__`` chain and the members of any
    exception group, returning True if any link looks like a human-fixable auth
    failure (401/403, unauthorized, forbidden, invalid_token/grant). A
    destination passes `extra_needles` for service-specific phrasings (e.g.
    Asana's "not authorized"); they are matched case-insensitively, and empty
    strings are ignored because they would match every message.
    Pure/deterministic — no I/O — so it's safe to import widely.

    Classification order per link:
      (1) an exact `status_code` attribute of 401/403;
      (2) a text needle anywhere in the lowercased message;
      (3) ONLY when the link carries no status_code, a word-boundary 401/403 in
          the message. Skipped for status-carrying errors so a 422/500 whose
          body mentions "403" isn't misread as auth.

    This lives in the spine so every destination shares ONE correct matcher
    instead of re-deriving the chain walk + code check (which is exactly where
    Notion and Asana had drifted apart).
    """
    needles = _AUTH_TEXT_NEEDLES + tuple(n.lower() for n in extra_needles if n)
    seen: set[int] = set()  # ids of visited links, guards against cycles
    stack: list[BaseException] = [err]
    while stack:
        cur = stack.pop()
        if id(cur) in seen:
            continue
        seen.add(id(cur))
        status = getattr(cur, "status_code", None)
        msg = str(cur).lower()
        if status in _AUTH_STATUS_CODES:
            return True
        if any(n in msg for n in needles):
            return True
        if status is None and _AUTH_CODE_RE.search(msg):
            return True
        if isinstance(cur, _EXC_GROUP_TYPES):
            stack.extend(cur.exceptions)
        for nxt in (cur.__cause__, cur.__context__):
            if nxt is not None:
                stack.append(nxt)
    return False
|
durable_sync/env.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Load a local `.env` into os.environ — dev convenience for `python -m …` tools
|
|
2
|
+
and the live smokes, so each script doesn't roll its own (which led to scripts
|
|
3
|
+
that silently ignored a populated `.env`).
|
|
4
|
+
|
|
5
|
+
Idempotent, never overrides an already-set var, no-op if there's no `.env`. Uses
|
|
6
|
+
python-dotenv when present (the `dev` extra); falls back to a tiny built-in parser
|
|
7
|
+
so it still works in a minimal install. Run scripts from the repo root.
|
|
8
|
+
|
|
9
|
+
NOT imported by config.py / the workflow sandbox — this does file IO, so it stays
|
|
10
|
+
out of the deterministic path and is called explicitly by scripts only.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
import stat
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def load_env(path: str | os.PathLike | None = None) -> None:
    """Load a `.env` file into ``os.environ``.

    ``path`` defaults to ``.env``. The file's permissions are checked first and
    a warning is printed if it is group/other-accessible. python-dotenv does the
    loading when it is importable; otherwise the built-in fallback parser is
    used.
    """
    target = Path(path) if path else Path(".env")
    _warn_if_world_readable(target)
    try:
        from dotenv import load_dotenv
    except ModuleNotFoundError:
        # Minimal install: no python-dotenv, use the tiny built-in parser.
        _load_fallback(path)
        return
    if path:
        load_dotenv(path)
    else:
        load_dotenv()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _warn_if_world_readable(p: Path) -> None:
|
|
31
|
+
"""The `.env` is the documented home for DURABLE_SYNC_ENC_KEY (the AES master
|
|
32
|
+
key) and connector PATs. The token JSON stores are chmod'd 0o600; the `.env`
|
|
33
|
+
must be too, or a local user reads the key that decrypts every token. Warn
|
|
34
|
+
(don't fail — a CI runner with injected env vars may have no `.env`)."""
|
|
35
|
+
try:
|
|
36
|
+
mode = p.stat().st_mode
|
|
37
|
+
except OSError:
|
|
38
|
+
return # no file / not readable — nothing loaded from it
|
|
39
|
+
if mode & (stat.S_IRWXG | stat.S_IRWXO):
|
|
40
|
+
print(
|
|
41
|
+
f"WARNING: {p} is group/other-accessible (mode {oct(mode & 0o777)}); "
|
|
42
|
+
f"it may hold secrets (DURABLE_SYNC_ENC_KEY, PATs). Run: chmod 600 {p}",
|
|
43
|
+
file=sys.stderr,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _load_fallback(path: str | os.PathLike | None) -> None:
|
|
48
|
+
p = Path(path) if path else Path(".env")
|
|
49
|
+
if not p.exists():
|
|
50
|
+
return
|
|
51
|
+
for line in p.read_text().splitlines():
|
|
52
|
+
line = line.strip()
|
|
53
|
+
if line and not line.startswith("#") and "=" in line:
|
|
54
|
+
k, v = line.split("=", 1)
|
|
55
|
+
os.environ.setdefault(k.strip(), v.strip().strip('"').strip("'"))
|
durable_sync/http.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Shared httpx retry/backoff for REST sources & destinations.
|
|
2
|
+
|
|
3
|
+
The destinations and the GitHub source all talk HTTP and all need the same
|
|
4
|
+
manners under rate limiting (honor `Retry-After`, otherwise exponential backoff).
|
|
5
|
+
That logic had drifted — Asana honored `Retry-After`, GitHub had no backoff at
|
|
6
|
+
all — so it lives here once. NOT used by the Notion destination: the MCP
|
|
7
|
+
transport surfaces failures as `isError` *results* rather than HTTP statuses, so
|
|
8
|
+
it keeps its own small retry loop in `NotionDestination.call`.
|
|
9
|
+
|
|
10
|
+
Runs inside Temporal activities (source fetch / destination session), never in a
|
|
11
|
+
workflow, so wall-clock `asyncio.sleep` is fine. Sleeps are capped so a long
|
|
12
|
+
rate-limit window becomes an activity retry (bounded by the activity timeout)
|
|
13
|
+
rather than a single multi-minute blocking sleep.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
|
|
19
|
+
import httpx
|
|
20
|
+
|
|
21
|
+
_MAX_ATTEMPTS = 6
|
|
22
|
+
_BASE_DELAY_SECONDS = 1.0
|
|
23
|
+
_MAX_DELAY_SECONDS = 60.0
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _should_retry(resp: httpx.Response, retry_statuses: tuple[int, ...]) -> bool:
|
|
27
|
+
if resp.status_code in retry_statuses:
|
|
28
|
+
return True
|
|
29
|
+
# GitHub signals both its primary and secondary rate limits with 403 plus
|
|
30
|
+
# either a Retry-After or an exhausted X-RateLimit-Remaining. A plain 403
|
|
31
|
+
# (genuine permission failure) has neither and is NOT retried — it surfaces
|
|
32
|
+
# so `is_auth_error` can pause the workflow.
|
|
33
|
+
if resp.status_code == 403 and (
|
|
34
|
+
resp.headers.get("Retry-After")
|
|
35
|
+
or resp.headers.get("X-RateLimit-Remaining") == "0"
|
|
36
|
+
):
|
|
37
|
+
return True
|
|
38
|
+
return False
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _retry_delay(resp: httpx.Response, attempt: int, base: float) -> float:
    """Return how long to sleep before the next retry, capped at ``_MAX_DELAY_SECONDS``.

    An all-digit ``Retry-After`` header is used as the number of seconds;
    otherwise the delay is ``base * 2 ** attempt``.
    """
    hinted = resp.headers.get("Retry-After")
    if hinted and hinted.isdigit():
        wait = float(hinted)
    else:
        wait = base * (2 ** attempt)
    return min(wait, _MAX_DELAY_SECONDS)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def request_with_retry(
    client: httpx.AsyncClient,
    method: str,
    url: str,
    *,
    headers: dict | None = None,
    params: dict | None = None,
    json: object | None = None,
    max_attempts: int = _MAX_ATTEMPTS,
    base_delay: float = _BASE_DELAY_SECONDS,
    retry_statuses: tuple[int, ...] = (429,),
) -> httpx.Response:
    """Send an httpx request, retrying rate-limited/transient responses with
    backoff that honors `Retry-After`.

    At most ``max_attempts`` requests are sent (always at least one). The final
    `Response` is returned as-is: this helper never raises on HTTP status,
    because the caller decides how to treat 4xx/5xx (e.g. GitHub treats 404 as
    "skip" while Asana raises). Network errors propagate to Temporal, which
    retries the whole activity.
    """

    async def _send() -> httpx.Response:
        return await client.request(method, url, headers=headers, params=params, json=json)

    resp = await _send()
    attempt = 0
    # The response from the last allowed attempt is returned without another
    # retry check, whatever its status.
    while attempt < max_attempts - 1 and _should_retry(resp, retry_statuses):
        await asyncio.sleep(_retry_delay(resp, attempt, base_delay))
        resp = await _send()
        attempt += 1
    return resp
|