durable-sync 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. durable_sync/__init__.py +26 -0
  2. durable_sync/activities.py +156 -0
  3. durable_sync/auth/__init__.py +8 -0
  4. durable_sync/auth/oauth/__init__.py +18 -0
  5. durable_sync/auth/oauth/flow.py +183 -0
  6. durable_sync/auth/oauth/refresh.py +58 -0
  7. durable_sync/auth/oauth/store.py +36 -0
  8. durable_sync/auth/oauth/token.py +36 -0
  9. durable_sync/auth/oauth/workflow.py +172 -0
  10. durable_sync/bootstrap.py +44 -0
  11. durable_sync/codec.py +80 -0
  12. durable_sync/config.py +35 -0
  13. durable_sync/connectors/__init__.py +14 -0
  14. durable_sync/connectors/asana/__init__.py +13 -0
  15. durable_sync/connectors/asana/destination.py +213 -0
  16. durable_sync/connectors/content.py +80 -0
  17. durable_sync/connectors/contentful/__init__.py +25 -0
  18. durable_sync/connectors/contentful/api.py +285 -0
  19. durable_sync/connectors/contentful/bootstrap.py +102 -0
  20. durable_sync/connectors/contentful/describe.py +61 -0
  21. durable_sync/connectors/contentful/destination.py +145 -0
  22. durable_sync/connectors/contentful/encode.py +49 -0
  23. durable_sync/connectors/contentful/introspect.py +69 -0
  24. durable_sync/connectors/contentful/mcp.py +95 -0
  25. durable_sync/connectors/contentful/mcp_destination.py +137 -0
  26. durable_sync/connectors/contentful/oauth.py +27 -0
  27. durable_sync/connectors/contentful/prove.py +51 -0
  28. durable_sync/connectors/contentful/source.py +192 -0
  29. durable_sync/connectors/contentful/start.py +46 -0
  30. durable_sync/connectors/contentful/store.py +25 -0
  31. durable_sync/connectors/contentful/token.py +13 -0
  32. durable_sync/connectors/contentful/token_check.py +42 -0
  33. durable_sync/connectors/github/__init__.py +33 -0
  34. durable_sync/connectors/github/api.py +169 -0
  35. durable_sync/connectors/github/source.py +230 -0
  36. durable_sync/connectors/luma/__init__.py +20 -0
  37. durable_sync/connectors/luma/api.py +121 -0
  38. durable_sync/connectors/luma/destination.py +128 -0
  39. durable_sync/connectors/luma/source.py +155 -0
  40. durable_sync/connectors/multi.py +78 -0
  41. durable_sync/connectors/notion/__init__.py +20 -0
  42. durable_sync/connectors/notion/bootstrap.py +97 -0
  43. durable_sync/connectors/notion/client.py +133 -0
  44. durable_sync/connectors/notion/destination.py +270 -0
  45. durable_sync/connectors/notion/oauth.py +25 -0
  46. durable_sync/connectors/notion/prove.py +57 -0
  47. durable_sync/connectors/notion/source.py +136 -0
  48. durable_sync/connectors/notion/start.py +46 -0
  49. durable_sync/connectors/notion/store.py +25 -0
  50. durable_sync/connectors/notion/token.py +13 -0
  51. durable_sync/connectors/youtube/__init__.py +13 -0
  52. durable_sync/connectors/youtube/api.py +122 -0
  53. durable_sync/connectors/youtube/source.py +152 -0
  54. durable_sync/core.py +210 -0
  55. durable_sync/env.py +55 -0
  56. durable_sync/http.py +71 -0
  57. durable_sync/linkstore.py +88 -0
  58. durable_sync/route.py +86 -0
  59. durable_sync/temporal_client.py +48 -0
  60. durable_sync/transport/__init__.py +12 -0
  61. durable_sync/transport/mcp.py +77 -0
  62. durable_sync/worker.py +109 -0
  63. durable_sync/workflows/__init__.py +9 -0
  64. durable_sync/workflows/sync.py +208 -0
  65. durable_sync-0.1.0.dist-info/METADATA +310 -0
  66. durable_sync-0.1.0.dist-info/RECORD +69 -0
  67. durable_sync-0.1.0.dist-info/WHEEL +5 -0
  68. durable_sync-0.1.0.dist-info/licenses/LICENSE +21 -0
  69. durable_sync-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,122 @@
1
+ """YouTube Data API v3 helpers — pure async HTTP + small pure transforms. No
2
+ Temporal, no config globals. Lists a channel's "uploads" playlist (channel id or
3
+ @handle, resolved here), yielding {videoId, title, description, publishedAt,
4
+ viewCount}. Read-only.
5
+
6
+ Docs: https://developers.google.com/youtube/v3/docs
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from datetime import datetime
12
+ from typing import Any
13
+
14
+ import httpx
15
+
16
+ from durable_sync.http import request_with_retry
17
+
18
+ API = "https://www.googleapis.com/youtube/v3"
19
+ PAGE = 50 # API max
20
+ log = logging.getLogger("durable_sync.connectors.youtube")
21
+
22
+
23
async def _get(client: httpx.AsyncClient, api_key: str, path: str, params: dict[str, Any]) -> dict[str, Any]:
    """GET ``{API}/{path}`` and return the decoded JSON body.

    The API key is merged into the query string as ``key``. The request goes
    through ``request_with_retry``; whatever response it finally returns is
    checked with ``raise_for_status()``, so a remaining 4xx/5xx raises here.
    """
    query = dict(params)
    query["key"] = api_key
    response = await request_with_retry(client, "GET", f"{API}/{path}", params=query)
    response.raise_for_status()
    return response.json()
27
+
28
+
29
def parse_ts(ts: str) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware ``datetime``.

    A trailing ``Z`` (UTC designator) is rewritten to ``+00:00`` so that
    ``datetime.fromisoformat`` accepts it on interpreters that predate native
    ``Z`` support. Only the suffix is rewritten — a ``Z`` elsewhere in the text
    is left alone. A timestamp with no offset is taken to be UTC, so results
    are always comparable with each other (mixing naive and aware datetimes
    raises ``TypeError`` on comparison).

    Raises:
        ValueError: if ``ts`` is not a valid ISO-8601 timestamp.
    """
    from datetime import timezone  # local import: the module only imports `datetime`

    text = ts.strip()
    if text[-1:] in ("Z", "z"):
        text = f"{text[:-1]}+00:00"
    parsed = datetime.fromisoformat(text)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
31
+
32
+
33
async def uploads_playlist(client: httpx.AsyncClient, api_key: str, channel: str) -> str:
    """Return the uploads-playlist id for a channel id ('UC…') or '@handle'.

    Input that starts with "UC" and contains no space is looked up by ``id``;
    anything else is looked up by ``forHandle`` with leading "@" removed.

    Raises:
        RuntimeError: if the API returns no matching channel.
    """
    ident = channel.strip()
    looks_like_id = ident.startswith("UC") and " " not in ident
    selector = {"id": ident} if looks_like_id else {"forHandle": ident.lstrip("@")}
    payload = await _get(client, api_key, "channels", {"part": "contentDetails", **selector})
    matches = payload.get("items", [])
    if not matches:
        raise RuntimeError(f"YouTube channel not found: {channel!r}")
    return matches[0]["contentDetails"]["relatedPlaylists"]["uploads"]
44
+
45
+
46
async def list_videos_page(
    client: httpx.AsyncClient, api_key: str, playlist: str, after_iso: str, *, page_token: str | None = None
) -> tuple[list[dict[str, Any]], str | None]:
    """Fetch ONE playlist page and keep the videos published on/after `after_iso`.

    Returns ``(metas, next_page_token)``. Each meta carries videoId, title,
    description, publishedAt and viewCount. Iteration stops at the first video
    older than the window (the uploads playlist is treated as
    reverse-chronological), in which case the returned token is None so the
    caller stops paging. Items missing a video id or publish time are skipped.
    """
    window_start = parse_ts(after_iso)
    query: dict[str, Any] = {"part": "snippet,contentDetails", "playlistId": playlist, "maxResults": PAGE}
    if page_token:
        query["pageToken"] = page_token
    payload = await _get(client, api_key, "playlistItems", query)

    page: list[dict[str, Any]] = []
    reached_window_edge = False
    for item in payload.get("items", []):
        snippet = item.get("snippet", {})
        details = item.get("contentDetails", {})
        video_id = details.get("videoId")
        published = details.get("videoPublishedAt") or snippet.get("publishedAt")
        if video_id and published:
            if parse_ts(published) < window_start:
                reached_window_edge = True
                break
            page.append({
                "videoId": video_id,
                "title": snippet.get("title", ""),
                "description": snippet.get("description", ""),
                "publishedAt": published,
            })

    # One batched statistics call for the whole page; skipped when nothing survived.
    if page:
        counts = await view_counts(client, api_key, [entry["videoId"] for entry in page])
    else:
        counts = {}
    for entry in page:
        entry["viewCount"] = counts.get(entry["videoId"])

    if reached_window_edge:
        return page, None
    return page, (payload.get("nextPageToken") or None)
79
+
80
+
81
async def list_videos(client: httpx.AsyncClient, api_key: str, playlist: str, after_iso: str) -> list[dict[str, Any]]:
    """Collect every video on/after `after_iso` by draining `list_videos_page`.

    Meant for non-Temporal callers; the spine pages directly instead.
    """
    collected: list[dict[str, Any]] = []
    token: str | None = None
    first_call = True
    # Always fetch at least one page, then keep going while a token is handed back.
    while first_call or token is not None:
        first_call = False
        batch, token = await list_videos_page(client, api_key, playlist, after_iso, page_token=token)
        collected += batch
    return collected
91
+
92
+
93
async def videos_by_id(client: httpx.AsyncClient, api_key: str, ids: list[str]) -> list[dict[str, Any]]:
    """Fetch specific videos by id (for targeted refreshes).

    Returns metas shaped like the playlist listing: videoId, title,
    description, publishedAt, viewCount (int, or None when the API omits it).
    Empty/falsy ids are ignored; an empty request returns ``[]`` without
    calling the API.

    The ids are sent in batches of at most ``PAGE`` per request, so a refresh
    of more than ``PAGE`` videos returns all of them instead of silently
    dropping everything after the first batch.
    """
    wanted = [video_id for video_id in ids if video_id]
    out: list[dict[str, Any]] = []
    for start in range(0, len(wanted), PAGE):
        batch = wanted[start:start + PAGE]
        data = await _get(client, api_key, "videos", {"part": "snippet,statistics", "id": ",".join(batch)})
        for it in data.get("items", []):
            sn, st = it.get("snippet", {}), it.get("statistics", {})
            vc = st.get("viewCount")
            out.append({
                "videoId": it.get("id", ""),
                "title": sn.get("title", ""),
                "description": sn.get("description", ""),
                "publishedAt": sn.get("publishedAt"),
                # The value is passed through int(); None is kept when the field is absent.
                "viewCount": int(vc) if vc is not None else None,
            })
    return out
111
+
112
+
113
async def view_counts(client: httpx.AsyncClient, api_key: str, ids: list[str]) -> dict[str, int | None]:
    """Map each video id to its viewCount using one batched `videos` call.

    Callers are expected to pass at most PAGE ids. A video whose statistics
    lack `viewCount` maps to None; an empty `ids` returns `{}` with no request.
    """
    if not ids:
        return {}
    payload = await _get(client, api_key, "videos", {"part": "statistics", "id": ",".join(ids)})
    counts: dict[str, int | None] = {}
    for item in payload.get("items", []):
        raw = item.get("statistics", {}).get("viewCount")
        counts[item["id"]] = None if raw is None else int(raw)
    return counts
@@ -0,0 +1,152 @@
1
+ """YouTubeSource — a channel's uploads -> Records, with a source-side enrich hook.
2
+
3
+ YouTube has no per-video author field. The base Record therefore leaves "Author"
4
+ empty and stashes the title + description as a "Scan Text" property (and on the
5
+ enrich context) so an app that needs attribution can scan it for known names —
6
+ an "inverted match" — rather than relying on a structured author. That policy
7
+ lives in your `enrich` hook, not here.
8
+
9
+ Auth: a YouTube Data API v3 key, read from the env var named by
10
+ `YouTubeConfig.token_env`. Requires the `youtube` extra.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import inspect
15
+ import logging
16
+ import os
17
+ from dataclasses import dataclass
18
+ from datetime import datetime, timedelta, timezone
19
+ from typing import Awaitable, Callable, Union
20
+
21
+ import httpx
22
+ from temporalio import activity
23
+
24
+ from durable_sync.core import Record, SourceSpec
25
+ from durable_sync.connectors import content
26
+ from durable_sync.connectors.youtube import api
27
+
28
+ log = logging.getLogger("durable_sync.connectors.youtube")
29
+
30
+ EnrichHook = Callable[[Record, "YouTubeVideoContext"], Union[Record, Awaitable[Record]]]
31
+
32
+ _MAX_SUMMARY = 2000
33
+
34
+
35
@dataclass
class YouTubeConfig:
    """Deployment-supplied YouTube settings.

    Only `channel` is required: either a channel id ('UC…') or a handle
    ('@name'), which the client resolves to an id.
    """
    channel: str
    # Name of the environment variable holding the API key (not the key itself).
    token_env: str = "YOUTUBE_API_KEY"
    # Size of the fetch window, in days before "now".
    lookback_days: int = 21
    # Passed through as the SourceSpec's interval_minutes.
    interval_minutes: int = 360
    # Destination property that receives the video title.
    title_property: str = "Name"
    # Value written to the neutral "Type" column.
    item_type: str = "Video"
46
+
47
+
48
@dataclass
class YouTubeVideoContext:
    """Per-video context passed to the enrich hook.

    Bundles the raw video meta (including the full description) with the live
    HTTP client and API key, so a hook can do inverted name-matching or make
    extra lookups.
    """
    # The meta dict produced by the api helpers for this video.
    raw_video: dict
    # Title + description joined by a newline, for inverted matching.
    scan_text: str
    # The open client used for this page's requests.
    client: httpx.AsyncClient
    api_key: str
56
+
57
+
58
def _heartbeat(detail: str) -> None:
    """Send a Temporal activity heartbeat with `detail`; do nothing outside an activity."""
    if not activity.in_activity():
        return
    activity.heartbeat(detail)
61
+
62
+
63
class YouTubeSource:
    """Source that turns a channel's uploads into Records.

    An optional `enrich` hook (sync or async) is called once per video with the
    mapped Record and a `YouTubeVideoContext`; whatever it returns replaces the
    Record.
    """
    name = "youtube"

    def __init__(self, config: YouTubeConfig, *, enrich: EnrichHook | None = None):
        self._config = config
        self._enrich = enrich

    def specs(self) -> list[SourceSpec]:
        """Return a single SourceSpec for the configured channel."""
        ch = self._config.channel
        return [SourceSpec(key=f"channel:{ch}", interval_minutes=self._config.interval_minutes,
                           params={"channel": ch})]

    async def fetch_page(
        self, spec: SourceSpec, only_items: list[str] | None, cursor: str | None
    ) -> tuple[list[Record], str | None]:
        """ONE page of videos + next cursor (None on the last page). The cursor
        carries the frozen window start, the resolved uploads playlist (so we don't
        re-resolve the channel each page), and YouTube's pageToken. A targeted
        (`only_items`) refresh is bounded, so it returns a single page."""
        cfg = self._config
        api_key = os.environ.get(cfg.token_env, "")
        if not api_key:
            # Still attempt the call (behavior unchanged), but make the root cause
            # visible instead of leaving only an opaque HTTP error downstream.
            log.warning("%s is not set; YouTube API requests will carry an empty key", cfg.token_env)
        channel = spec.params.get("channel", cfg.channel)

        async with httpx.AsyncClient(timeout=30) as client:
            if only_items:
                videos = await api.videos_by_id(client, api_key, only_items)
                next_cursor = None
            else:
                if cursor is None:
                    # First page: freeze the window start and resolve the playlist once.
                    after_iso = (datetime.now(timezone.utc) - timedelta(days=cfg.lookback_days)).isoformat()
                    playlist = await api.uploads_playlist(client, api_key, channel)
                    page_token = None
                else:
                    c = content.unpack_cursor(cursor)
                    after_iso, playlist, page_token = c["after"], c["playlist"], c["token"]
                videos, next_token = await api.list_videos_page(
                    client, api_key, playlist, after_iso, page_token=page_token)
                next_cursor = (
                    content.pack_cursor(after=after_iso, playlist=playlist, token=next_token)
                    if next_token else None
                )

            # Enrichment runs inside the client context so the hook gets a live client.
            out: list[Record] = []
            for v in videos:
                record = self._to_record(v)
                if self._enrich is not None:
                    title = v.get("title") or ""
                    # `or ""` (not a .get default) so a null description never
                    # renders as the literal "None" — matches _to_record.
                    description = v.get("description") or ""
                    scan_text = f"{title}\n{description}"
                    ctx = YouTubeVideoContext(raw_video=v, scan_text=scan_text, client=client, api_key=api_key)
                    result = self._enrich(record, ctx)
                    record = await result if inspect.isawaitable(result) else result
                out.append(record)
                _heartbeat(v.get("videoId", ""))

        log.info("Fetched %d YouTube videos for %s (cursor=%s)", len(out), spec.key, cursor)
        return out, next_cursor

    async def fetch(self, spec: SourceSpec, only_items: list[str] | None = None) -> list[Record]:
        """Whole window as one list — drains fetch_page (standalone/non-Temporal)."""
        records: list[Record] = []
        cursor: str | None = None
        while True:
            page, cursor = await self.fetch_page(spec, only_items, cursor)
            records.extend(page)
            if cursor is None:
                return records

    def _to_record(self, v: dict) -> Record:
        """Map one video to a neutral Record. Pure (no IO)."""
        cfg = self._config
        vid = v.get("videoId", "")
        title = v.get("title") or "(untitled video)"
        description = v.get("description") or ""
        return content.content_record(
            primary_key=vid,
            title_property=cfg.title_property,
            title=title,
            item_type=cfg.item_type,
            source="YouTube",
            url=f"https://www.youtube.com/watch?v={vid}" if vid else None,
            date=v.get("publishedAt"),
            status="Published",
            author="",  # no per-video author on YouTube
            extra={
                "Reach": v.get("viewCount"),
                "Summary": description[:_MAX_SUMMARY],
                # Free text for inverted matching by an app's enrich/transform hook.
                "Scan Text": f"{title}\n{description}"[:_MAX_SUMMARY],
            },
        )
durable_sync/core.py ADDED
@@ -0,0 +1,210 @@
1
+ """Generic, source/destination-agnostic spine. No I/O here — this module is
2
+ imported into the Temporal workflow sandbox, so it must stay side-effect-free.
3
+
4
+ The whole library reduces to two seams:
5
+
6
+ * a Source produces `Record`s (fetch + map your data),
7
+ * a Destination upserts them idempotently.
8
+
9
+ Everything painful — durable orchestration, idempotent upsert, OAuth refresh,
10
+ pagination, rate-limit backoff, error handling — lives in the spine and is
11
+ inherited for free. To add a source you implement `Source`; to add a
12
+ destination, `Destination`. Reference implementations: GitHub (source), Notion
13
+ and Asana (destinations).
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import datetime as dt
18
+ import re
19
+ from dataclasses import dataclass, field
20
+ from typing import Any, AsyncContextManager, Protocol, runtime_checkable
21
+
22
+
23
@dataclass
class Record:
    """A single row to upsert, expressed independently of any destination.

    Values in `properties` are NEUTRAL Python types; wire-encoding belongs to
    the Destination, so a Source author never has to learn a destination's
    quirks (Notion's multi-select JSON, Asana's custom fields, etc.):

        str               -> text / url / select / title
        bool              -> checkbox
        int | float       -> number
        list[str]         -> multi-select
        datetime.date     -> date
        datetime.datetime -> datetime
        None              -> property omitted

    `primary_key` is the IMMUTABLE idempotency key (e.g. a repo id) and must
    never be a name or URL — that is what keeps at-least-once retries safe.
    `body` is optional long-form content (e.g. a README / task notes), written
    on create.
    """
    primary_key: str
    properties: dict[str, Any]
    body: str | None = None
44
+
45
+
46
@dataclass
class SourceSpec:
    """A unit of work for a Source, given to its per-source entity workflow.

    `key` is a stable id from which the workflow id is derived. `params` is
    opaque config defined by the source itself
    (e.g. {"kind": "org", "org": "temporal-community"}).
    """
    key: str
    interval_minutes: int = 30
    # default_factory gives every spec its own dict rather than a shared one.
    params: dict[str, Any] = field(default_factory=dict)
54
+
55
+
56
@runtime_checkable
class Source(Protocol):
    """Protocol a data source implements. GitHubSource is the reference impl."""
    name: str

    def specs(self) -> list[SourceSpec]:
        """Return one SourceSpec per independent unit; each gets its own workflow."""
        ...

    async def fetch(
        self, spec: SourceSpec, only_items: list[str] | None = None
    ) -> list[Record]:
        """Fetch the unit (or only `only_items`) and map it to Records.

        Every bit of source-specific I/O and field-mapping belongs here. The
        WHOLE unit comes back as one list — the simplest option, and fine up to
        ~hundreds of records.

        A source that may return many thousands should implement `fetch_page`
        (described below) instead: the spine then drives it page-by-page, so
        neither the fetch result nor the upsert travels through Temporal
        history as one oversized payload.
        """
        ...

    # OPTIONAL (the spine looks it up with getattr — like the Destination's aux
    # hooks), but PREFERRED: every shipped connector (GitHub/Luma/YouTube/Contentful)
    # implements it and keeps fetch() as a thin drain over it. The spine calls it
    # repeatedly, threading your cursor, and upserts each page before requesting the
    # next — which bounds history regardless of total size. When present it is used
    # in preference to fetch(). A genuinely tiny source can skip it and implement
    # only fetch() (the spine treats that as one page).
    #
    # async def fetch_page(
    #     self, spec: SourceSpec, only_items: list[str] | None, cursor: str | None
    # ) -> tuple[list[Record], str | None]:
    #     """Return (records_for_this_page, next_cursor). next_cursor is None on
    #     the last page. `cursor` is None on the first call. Opaque to the spine —
    #     use whatever your API's pagination token is (offset, page no, cursor)."""
91
+
92
+
93
class DestinationSession(Protocol):
    """One open connection to the destination, scoped to a single sync pass."""

    async def query_existing_ids(self) -> dict[str, str]:
        """Return { primary_key -> destination-internal id } for rows already present."""
        ...

    async def create(self, record: Record, synced_at: dt.datetime) -> bool:
        """Insert a new row.

        `synced_at` is the sync-pass timestamp, passed as a real datetime; the
        destination formats it however its schema needs. Returns True if the
        row was written and False if it was SKIPPED (e.g. a destination-side
        enrich hook dropped the record as out-of-scope).
        """
        ...

    async def update(self, existing_id: str, record: Record, synced_at: dt.datetime) -> bool:
        """Refresh an existing row.

        `create_only` properties are left untouched so human edits to those
        seeds survive. `synced_at` has the same meaning as in create(). Returns
        True if written, False if skipped.
        """
        ...
112
+
113
+
114
class Destination(Protocol):
    """Protocol a destination implements.

    NotionDestination / AsanaDestination are the reference impls (MCP+OAuth and
    REST+PAT respectively) — the protocol is deliberately neither transport-
    nor auth-shaped.
    """
    name: str

    # Becomes True once the destination has the config it needs to write (e.g. a
    # target id). The spine refuses to sync a destination that is not configured.
    configured: bool

    # Properties written on CREATE only — enrichment seeds that a human refines and
    # that are never overwritten on update. The mechanism is generic; each Source
    # says which fields. Honored by update().
    create_only_properties: set[str]

    def connect(self) -> AsyncContextManager[DestinationSession]: ...

    # OPTIONAL hooks (the worker looks them up with getattr — don't define if unused):
    #   def aux_workflows(self) -> list: ...   extra Temporal workflows to register
    #   def aux_activities(self) -> list: ...  extra activities to register
    # e.g. the Notion destination registers its token-owner auth workflow here.

    @property
    def config_hint(self) -> str:
        """Human-readable hint naming what to set when `configured` is False
        (e.g. an env var), so destination-specific config names stay out of the
        generic spine's error messages."""
        ...

    @staticmethod
    def is_auth_error(err: BaseException) -> bool:
        """Return True if `err` is an auth failure only a human can fix, so the
        workflow pauses instead of hammering. Destination-specific. OPTIONAL:
        a destination with no interactive auth (e.g. a local DB) should just
        `return False`. Most HTTP destinations can delegate to
        `auth_error_in_chain`."""
        ...
150
+
151
+
152
class DestinationHTTPError(RuntimeError):
    """HTTP failure reported by a destination.

    The numeric `status_code` is stored SEPARATELY from the message so that
    `auth_error_in_chain` can classify on the code instead of scanning the
    (up-to-600-char) response body, where a stray standalone "403" in an error
    payload would spuriously pause the workflow. Destinations should raise this
    rather than a bare RuntimeError for HTTP failures.
    """

    def __init__(self, status_code: int, message: str) -> None:
        # Only the message becomes the exception text; the code rides alongside.
        RuntimeError.__init__(self, message)
        self.status_code = status_code
162
+
163
+
164
+ # Default auth-failure signatures shared by HTTP destinations. The numeric code is
165
+ # taken from a DestinationHTTPError.status_code when present; otherwise (a plain
166
+ # exception) we fall back to a WORD-BOUNDARY text match so a bare "401"/"403"
167
+ # inside a UUID or request-id can't false-positive — the bug that once paused a
168
+ # workflow on a Notion validation_error whose id contained "401e".
169
+ _AUTH_TEXT_NEEDLES = ("unauthorized", "forbidden", "invalid_token", "invalid_grant")
170
+ _AUTH_STATUS_CODES = (401, 403)
171
+ _AUTH_CODE_RE = re.compile(r"\b(401|403)\b")
172
+
173
+
174
+ def auth_error_in_chain(err: BaseException, *, extra_needles: tuple[str, ...] = ()) -> bool:
175
+ """Shared `is_auth_error` implementation: walk `err`'s cause/context chain and
176
+ any ExceptionGroup, returning True if any link looks like a human-fixable auth
177
+ failure (401/403, unauthorized, forbidden, invalid_token/grant). A destination
178
+ passes `extra_needles` for service-specific phrasings (e.g. Asana's "not
179
+ authorized"). Pure/deterministic — no I/O — so it's safe to import widely.
180
+
181
+ Classification order per link: (1) a DestinationHTTPError's exact status_code;
182
+ (2) a text needle; (3) ONLY when the link carries no status_code, a
183
+ word-boundary 401/403 in the message. (3) is skipped for status-carrying errors
184
+ so a 422/500 whose body mentions "403" isn't misread as auth.
185
+
186
+ This lives in the spine so every destination shares ONE correct matcher
187
+ instead of re-deriving the chain walk + code check (which is exactly where
188
+ Notion and Asana had drifted apart)."""
189
+ needles = _AUTH_TEXT_NEEDLES + tuple(n.lower() for n in extra_needles)
190
+ seen: set[int] = set()
191
+ stack: list[BaseException] = [err]
192
+ while stack:
193
+ cur = stack.pop()
194
+ if id(cur) in seen:
195
+ continue
196
+ seen.add(id(cur))
197
+ status = getattr(cur, "status_code", None)
198
+ msg = str(cur).lower()
199
+ if status in _AUTH_STATUS_CODES:
200
+ return True
201
+ if any(n in msg for n in needles):
202
+ return True
203
+ if status is None and _AUTH_CODE_RE.search(msg):
204
+ return True
205
+ if isinstance(cur, BaseExceptionGroup):
206
+ stack.extend(cur.exceptions)
207
+ for nxt in (cur.__cause__, cur.__context__):
208
+ if nxt is not None:
209
+ stack.append(nxt)
210
+ return False
durable_sync/env.py ADDED
@@ -0,0 +1,55 @@
1
+ """Load a local `.env` into os.environ — dev convenience for `python -m …` tools
2
+ and the live smokes, so each script doesn't roll its own (which led to scripts
3
+ that silently ignored a populated `.env`).
4
+
5
+ Idempotent, never overrides an already-set var, no-op if there's no `.env`. Uses
6
+ python-dotenv when present (the `dev` extra); falls back to a tiny built-in parser
7
+ so it still works in a minimal install. Run scripts from the repo root.
8
+
9
+ NOT imported by config.py / the workflow sandbox — this does file IO, so it stays
10
+ out of the deterministic path and is called explicitly by scripts only.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ import stat
16
+ import sys
17
+ from pathlib import Path
18
+
19
+
20
def load_env(path: str | os.PathLike | None = None) -> None:
    """Load `.env` (or `path`) into os.environ.

    First warns if the file is group/other-accessible. Then uses python-dotenv
    when it is importable, otherwise the built-in fallback parser.
    """
    _warn_if_world_readable(Path(path) if path else Path(".env"))
    try:
        from dotenv import load_dotenv
    except ModuleNotFoundError:
        _load_fallback(path)
    else:
        if path:
            load_dotenv(path)
        else:
            load_dotenv()
28
+
29
+
30
def _warn_if_world_readable(p: Path) -> None:
    """Print a stderr warning when `p` has any group/other permission bit set.

    The `.env` is the documented home for DURABLE_SYNC_ENC_KEY (the AES master
    key) and connector PATs. The token JSON stores are chmod'd 0o600 and the
    `.env` must be too, or a local user can read the key that decrypts every
    token. This only warns and never fails — a CI runner with injected env vars
    may have no `.env` at all.
    """
    try:
        permissions = p.stat().st_mode
    except OSError:
        # No file / not readable — nothing gets loaded from it.
        return
    exposed = permissions & (stat.S_IRWXG | stat.S_IRWXO)
    if not exposed:
        return
    message = (
        f"WARNING: {p} is group/other-accessible (mode {oct(permissions & 0o777)}); "
        f"it may hold secrets (DURABLE_SYNC_ENC_KEY, PATs). Run: chmod 600 {p}"
    )
    print(message, file=sys.stderr)
45
+
46
+
47
def _load_fallback(path: str | os.PathLike | None) -> None:
    """Minimal `.env` parser used when python-dotenv is not installed.

    Reads `path` (default: `.env` in the current directory) as UTF-8 and, for
    every `KEY=VALUE` line, sets the variable only if it is not already set.
    Blank lines, `#` comment lines and lines without `=` are ignored. A leading
    `export ` on the key is dropped so shell-style files work, and a line with
    an empty key is skipped rather than passed to os.environ. Surrounding
    quotes are stripped from the value. A missing file (or a non-file path) is
    a no-op.
    """
    p = Path(path) if path else Path(".env")
    if not p.is_file():
        return
    for raw_line in p.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        if key.startswith("export "):
            # `export KEY=value` is valid in shell-sourced .env files.
            key = key[len("export "):].strip()
        if not key:
            continue  # "=value": an empty variable name cannot be set
        os.environ.setdefault(key, value.strip().strip('"').strip("'"))
durable_sync/http.py ADDED
@@ -0,0 +1,71 @@
1
+ """Shared httpx retry/backoff for REST sources & destinations.
2
+
3
+ The destinations and the GitHub source all talk HTTP and all need the same
4
+ manners under rate limiting (honor `Retry-After`, otherwise exponential backoff).
5
+ That logic had drifted — Asana honored `Retry-After`, GitHub had no backoff at
6
+ all — so it lives here once. NOT used by the Notion destination: the MCP
7
+ transport surfaces failures as `isError` *results* rather than HTTP statuses, so
8
+ it keeps its own small retry loop in `NotionDestination.call`.
9
+
10
+ Runs inside Temporal activities (source fetch / destination session), never in a
11
+ workflow, so wall-clock `asyncio.sleep` is fine. Sleeps are capped so a long
12
+ rate-limit window becomes an activity retry (bounded by the activity timeout)
13
+ rather than a single multi-minute blocking sleep.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+
19
+ import httpx
20
+
21
+ _MAX_ATTEMPTS = 6
22
+ _BASE_DELAY_SECONDS = 1.0
23
+ _MAX_DELAY_SECONDS = 60.0
24
+
25
+
26
def _should_retry(resp: httpx.Response, retry_statuses: tuple[int, ...]) -> bool:
    """Decide whether `resp` is worth retrying.

    Any status listed in `retry_statuses` is retried. A 403 is retried only
    when it looks like a rate limit: GitHub signals both its primary and
    secondary limits with 403 plus either a Retry-After header or an exhausted
    X-RateLimit-Remaining. A plain 403 (a genuine permission failure) carries
    neither and is NOT retried — it surfaces so `is_auth_error` can pause the
    workflow.
    """
    code = resp.status_code
    if code in retry_statuses:
        return True
    if code != 403:
        return False
    hdrs = resp.headers
    return bool(hdrs.get("Retry-After") or hdrs.get("X-RateLimit-Remaining") == "0")
39
+
40
+
41
def _retry_delay(resp: httpx.Response, attempt: int, base: float) -> float:
    """Seconds to wait before the next attempt, capped at _MAX_DELAY_SECONDS.

    An all-digit Retry-After header is honored; otherwise the delay is
    exponential: `base * 2 ** attempt`.
    """
    hinted = resp.headers.get("Retry-After")
    if hinted and hinted.isdigit():
        wait = float(hinted)
    else:
        wait = base * (2 ** attempt)
    return min(wait, _MAX_DELAY_SECONDS)
46
+
47
+
48
async def request_with_retry(
    client: httpx.AsyncClient,
    method: str,
    url: str,
    *,
    headers: dict | None = None,
    params: dict | None = None,
    json: object | None = None,
    max_attempts: int = _MAX_ATTEMPTS,
    base_delay: float = _BASE_DELAY_SECONDS,
    retry_statuses: tuple[int, ...] = (429,),
) -> httpx.Response:
    """Send an httpx request, retrying retryable responses with backoff.

    The delay between attempts honors `Retry-After` and is otherwise
    exponential. At most `max_attempts` requests are sent (always at least
    one). The final `Response` is returned whatever its status — this helper
    never raises on HTTP status, since the caller decides how to treat 4xx/5xx
    (e.g. GitHub treats 404 as "skip" and Asana raises). Network errors
    propagate to Temporal, which retries the whole activity.
    """
    async def send() -> httpx.Response:
        return await client.request(method, url, headers=headers, params=params, json=json)

    resp = await send()
    attempt = 0
    # The attempt budget is checked first, so the last response is returned
    # without being re-evaluated once the budget is spent.
    while attempt < max_attempts - 1 and _should_retry(resp, retry_statuses):
        await asyncio.sleep(_retry_delay(resp, attempt, base_delay))
        resp = await send()
        attempt += 1
    return resp