contextbase-plugin-microsoft-mail 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,375 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable, Iterator
4
+ from dataclasses import dataclass
5
+ from typing import Any
6
+ from urllib.parse import parse_qs, urlparse
7
+
8
+ import dlt
9
+ from shared_plugins.naming import (
10
+ dlt_resource_name,
11
+ dlt_source_name,
12
+ plugin_id_from_module,
13
+ )
14
+ from shared_plugins.resources import ctx_dlt_resource
15
+
16
+ from ..models.ctx import (
17
+ MAIL_FOLDER_COLUMN_DESCRIPTIONS,
18
+ MESSAGE_COLUMN_DESCRIPTIONS,
19
+ MailFolderRow,
20
+ MessageRow,
21
+ )
22
+ from ..models.translators import (
23
+ mail_folder_rows_to_ctx_models,
24
+ message_rows_to_ctx_models,
25
+ )
26
+ from ..utils.client import (
27
+ SyncGraphMailClient,
28
+ graph_object_to_payload,
29
+ )
30
+
31
+ # Known unhandled correctness gaps:
32
+ #
33
+ # 1. Stale messages from removed/hidden folders. `apply_mail_folder_delta_rows`
34
+ # drops folder IDs from the active set when a folder is @removed or
35
+ # transitions to isHidden, and `messages` only iterates the active set.
36
+ # Existing message rows (and their attachment_content rows) under the dropped
37
+ # folder remain in the destination forever — no tombstone path covers this
38
+ # transition.
39
+ #
40
+ # 2. Attachment same-count replacements (lives in sources/attachments.py). The
41
+ # candidate query compares counts of materializable attachments vs. existing
42
+ # attachment_content rows; a message changing [att-1] -> [att-2] looks like
43
+ # 1 == 1 and never re-materializes. The stale file row persists, the new
44
+ # attachment is missing.
45
+
46
# Keys under which sync progress is kept in the dlt source state.
# Cursor URL the last mail-folder delta drain finished on.
MAIL_FOLDER_DELTA_CURSOR_URL_KEY = "cursor_url"
# Folder id -> summary dict for every folder still considered active.
ACTIVE_MAIL_FOLDERS_KEY = "active_folders_by_id"
# Folder id -> cursor URL the last message delta drain finished on.
MESSAGE_DELTA_CURSOR_URLS_BY_FOLDER_ID_KEY = "cursor_urls_by_folder_id"

# PLUGIN_ID and JOB are combined by dlt_source_name() to name the dlt source.
PLUGIN_ID = plugin_id_from_module(__file__)
JOB = "sync"
# Page size requested through the odata.maxpagesize preference below.
DELTA_PAGE_SIZE = 100
# Prefer header value passed to both the folder and the message delta calls.
DELTA_PREFER_HEADER = f'IdType="ImmutableId", odata.maxpagesize={DELTA_PAGE_SIZE}'
# Query parameters passed to the mail-folder delta call: the folder fields
# to select.
MAIL_FOLDER_DELTA_QUERY_PARAMS = {
    "$select": [
        "id",
        "displayName",
        "parentFolderId",
        "childFolderCount",
        "totalItemCount",
        "unreadItemCount",
        "isHidden",
    ],
}
# "$orderby" value passed to the message delta call.
MESSAGE_DELTA_ORDERBY = ["receivedDateTime desc"]
# When False, folder payloads with a truthy "isHidden" are left out of the
# active folder set (see should_include_mail_folder_payload).
INCLUDE_HIDDEN_MAIL_FOLDERS = False
67
+
68
+
69
@dataclass(frozen=True)
class MailFolderDeltaDrainResult:
    """Immutable outcome of one complete drain of the mail-folder delta feed."""

    # Raw folder payloads collected across all drained pages, in arrival order.
    mail_folder_rows: list[dict[str, Any]]
    # Active folders keyed by folder id, after the drained rows were applied.
    active_mail_folders_by_id: dict[str, dict[str, Any]]
    # Cursor URL the drain finished on; the next drain resumes from it.
    cursor_url: str
74
+
75
+
76
@dataclass(frozen=True)
class DeltaPage:
    """One page of a delta drain together with the link that follows it."""

    # Rows extracted from this page by the caller-supplied extractor.
    rows: list[Any]
    # The page's delta link when it is the final page, otherwise its next link.
    cursor_url: str
80
+
81
+
82
def is_delta_cursor_url(cursor_url: str) -> bool:
    """Return True when *cursor_url* carries a delta token query parameter.

    Both the ``$deltatoken`` and the bare ``deltatoken`` spellings are
    accepted. A parameter with an empty value does not count, because
    ``parse_qs`` drops blank values by default.
    """
    params = parse_qs(urlparse(cursor_url).query)
    return any(key in params for key in ("$deltatoken", "deltatoken"))
85
+
86
+
87
def drain_delta_pages(
    *,
    initial_cursor_url: str | None,
    fetch_page: Callable[[str | None], Any],
    rows_from_page: Callable[[Any], list[Any]],
) -> Iterator[DeltaPage]:
    """Yield every page of a delta feed until the page carrying a delta link.

    Args:
        initial_cursor_url: URL handed to the first ``fetch_page`` call; may
            be ``None``.
        fetch_page: Fetches one response for the given URL.
        rows_from_page: Extracts the rows from a fetched response.

    Yields:
        One ``DeltaPage`` per response. Its ``cursor_url`` is the response's
        ``odata_delta_link`` on the final page and its ``odata_next_link``
        on every earlier page.

    Raises:
        RuntimeError: If a response has neither link.
    """
    pending_url = initial_cursor_url
    finished = False

    while not finished:
        page = fetch_page(pending_url)
        page_rows = rows_from_page(page)

        # A delta link marks the final page; otherwise a next link is required.
        link = getattr(page, "odata_delta_link", None)
        finished = bool(link)
        if not finished:
            link = getattr(page, "odata_next_link", None)
            if not link:
                raise RuntimeError(
                    "Graph delta response did not include @odata.nextLink or "
                    "@odata.deltaLink"
                )

        yield DeltaPage(rows=page_rows, cursor_url=link)
        pending_url = link
113
+
114
+
115
def message_delta_cursor_urls_for_active_mail_folders(
    previous_message_cursor_urls_by_folder_id: dict[str, str],
    active_mail_folder_ids: list[str],
) -> dict[str, str]:
    """Return the stored message cursors restricted to active folders.

    The result is a new dict that keeps the insertion order of the stored
    mapping; cursors of folders missing from *active_mail_folder_ids* are
    dropped.
    """
    still_active = frozenset(active_mail_folder_ids)
    kept: dict[str, str] = {}
    for folder_id, cursor_url in previous_message_cursor_urls_by_folder_id.items():
        if folder_id in still_active:
            kept[folder_id] = cursor_url
    return kept
125
+
126
+
127
def message_delta_cursor_urls_with_cursor_for_folder(
    message_cursor_urls_by_folder_id: dict[str, str],
    *,
    folder_id: str,
    cursor_url: str,
) -> dict[str, str]:
    """Return a copy of the cursor mapping with one folder's cursor set.

    The input mapping is left untouched. An existing entry for *folder_id*
    keeps its position and gets the new value.
    """
    merged = dict(message_cursor_urls_by_folder_id)
    merged[folder_id] = cursor_url
    return merged
137
+
138
+
139
def should_include_mail_folder_payload(payload: dict[str, Any]) -> bool:
    """Decide whether a folder payload belongs in the active folder set.

    A payload is rejected when it carries an ``@removed`` entry, or when it
    is hidden and hidden folders are not being included.
    """
    removed_marker = payload.get("@removed")
    if removed_marker is not None:
        return False
    # Equivalent to: not (hidden folders excluded and this folder is hidden).
    return bool(INCLUDE_HIDDEN_MAIL_FOLDERS or not payload.get("isHidden"))
145
+
146
+
147
def mail_folder_delta_page_rows(response: Any) -> list[dict[str, Any]]:
    """Convert the folders of one delta response into payload dicts.

    A response without a ``value`` attribute, or with an empty one, yields
    an empty list.

    Raises:
        RuntimeError: If a converted payload has no truthy ``id``.
    """
    graph_folders = getattr(response, "value", None) or []
    payloads: list[dict[str, Any]] = []
    for graph_folder in graph_folders:
        payload = graph_object_to_payload(graph_folder)
        folder_id = payload.get("id")
        if not folder_id:
            raise RuntimeError(
                f"Graph mail-folder delta row missing id: keys={sorted(payload)}"
            )
        payloads.append(payload)
    return payloads
157
+
158
+
159
def apply_mail_folder_delta_rows(
    *,
    previous_active_mail_folders_by_id: dict[str, dict[str, Any]],
    mail_folder_rows: list[dict[str, Any]],
) -> dict[str, dict[str, Any]]:
    """Apply drained folder rows to the previous active-folder mapping.

    Rows are applied in order to a copy of the previous mapping: an
    includable row inserts or overwrites its folder summary, any other row
    removes the folder if present. The input mapping is not modified.
    """
    active = dict(previous_active_mail_folders_by_id)

    for folder_row in mail_folder_rows:
        key = str(folder_row["id"])
        if not should_include_mail_folder_payload(folder_row):
            # Removed or excluded folder: stop tracking it.
            active.pop(key, None)
            continue
        active[key] = {
            "id": key,
            "display_name": folder_row.get("displayName"),
            "parent_folder_id": folder_row.get("parentFolderId"),
            "is_hidden": folder_row.get("isHidden"),
        }

    return active
179
+
180
+
181
def drain_mail_folder_delta(
    *,
    client: SyncGraphMailClient,
    previous_cursor_url: str | None,
    previous_active_mail_folders_by_id: dict[str, dict[str, Any]],
) -> MailFolderDeltaDrainResult:
    """Drain the mail-folder delta feed and compute the new active folder set.

    Args:
        client: Client used to fetch folder delta pages.
        previous_cursor_url: Cursor to resume from, or ``None``.
        previous_active_mail_folders_by_id: Active folders before this drain.

    Returns:
        The collected folder rows, the updated active-folder mapping and the
        cursor URL the drain finished on.

    Raises:
        RuntimeError: If the drain produced no page, or its last cursor URL
            is not a delta cursor.
    """

    def fetch_folder_page(cursor_url: str | None) -> Any:
        return client.get_folder_delta_page(
            delta_url=cursor_url,
            query_params=MAIL_FOLDER_DELTA_QUERY_PARAMS,
            prefer_header=DELTA_PREFER_HEADER,
        )

    collected_rows: list[dict[str, Any]] = []
    final_cursor_url: str | None = None

    pages = drain_delta_pages(
        initial_cursor_url=previous_cursor_url,
        fetch_page=fetch_folder_page,
        rows_from_page=mail_folder_delta_page_rows,
    )
    for page in pages:
        collected_rows.extend(page.rows)
        final_cursor_url = page.cursor_url

    finished_on_delta_cursor = final_cursor_url is not None and is_delta_cursor_url(
        final_cursor_url
    )
    if not finished_on_delta_cursor:
        raise RuntimeError("mail folder delta drain did not finish with a delta cursor")

    active_folders = apply_mail_folder_delta_rows(
        previous_active_mail_folders_by_id=previous_active_mail_folders_by_id,
        mail_folder_rows=collected_rows,
    )
    return MailFolderDeltaDrainResult(
        mail_folder_rows=collected_rows,
        active_mail_folders_by_id=active_folders,
        cursor_url=final_cursor_url,
    )
213
+
214
+
215
def message_delta_page_rows(
    *,
    response: Any,
    folder_id: str,
) -> list[Any]:
    """Return the messages of one delta response after checking their ids.

    Only the ``id`` attribute is validated here. ``parent_folder_id`` is
    deliberately left alone: @removed rows carry only id + @removed, and the
    translator turns them into tombstones using ``folder_id`` from context.
    Live rows without a ``parent_folder_id`` fail pydantic validation at
    ``MessageRow.parent_folder_id`` instead.

    Raises:
        RuntimeError: If a message has no truthy ``id``.
    """
    graph_messages = getattr(response, "value", None) or []
    checked: list[Any] = []
    for graph_message in graph_messages:
        if not getattr(graph_message, "id", None):
            raise RuntimeError(
                f"Graph message delta row missing id (folder_id={folder_id!r})"
            )
        checked.append(graph_message)
    return checked
233
+
234
+
235
def drain_message_delta_pages_for_folder(
    *,
    client: SyncGraphMailClient,
    folder_id: str,
    previous_cursor_url: str | None,
    initial_message_delta_top: int | None,
) -> Iterator[DeltaPage]:
    """Yield the message delta pages of one folder until its delta link.

    Args:
        client: Client used to fetch message delta pages.
        folder_id: Folder whose messages are drained.
        previous_cursor_url: Cursor to resume from, or ``None``.
        initial_message_delta_top: Value sent as the ``$top`` query parameter.
    """

    def fetch_message_page(cursor_url: str | None) -> Any:
        # A fresh parameter dict is built for every request, never shared.
        request_params = {
            "$orderby": MESSAGE_DELTA_ORDERBY,
            "$top": initial_message_delta_top,
            "$expand": [
                "attachments($select=id,name,contentType,size,isInline,lastModifiedDateTime)"
            ],
        }
        return client.get_message_delta_page(
            folder_id=folder_id,
            delta_url=cursor_url,
            query_params=request_params,
            prefer_header=DELTA_PREFER_HEADER,
        )

    def checked_rows(response: Any) -> list[Any]:
        return message_delta_page_rows(response=response, folder_id=folder_id)

    yield from drain_delta_pages(
        initial_cursor_url=previous_cursor_url,
        fetch_page=fetch_message_page,
        rows_from_page=checked_rows,
    )
261
+
262
+
263
@dlt.source(name=dlt_source_name(PLUGIN_ID, JOB))
def microsoft_mail_source(
    binding_id: str,
    *,
    client: SyncGraphMailClient,
    initial_message_delta_top: int | None = None,
) -> tuple[Any, ...]:
    """Build the ``mail_folders`` and ``messages`` resources for one binding.

    Both resources share a single mail-folder delta drain: whichever runs
    first performs it, records the folder cursor and the active folder set
    in the source state, and caches the result for the other.

    Args:
        binding_id: Identifier passed to the row translators.
        client: Client used for all delta requests.
        initial_message_delta_top: Value forwarded as ``$top`` on message
            delta requests.

    Returns:
        The ``(mail_folders, messages)`` resource pair.
    """
    cached_folder_snapshot: MailFolderDeltaDrainResult | None = None

    def get_mail_folder_delta_snapshot() -> MailFolderDeltaDrainResult:
        nonlocal cached_folder_snapshot
        if cached_folder_snapshot is None:
            state = dlt.current.source_state()
            stored_cursor_url = state.get(MAIL_FOLDER_DELTA_CURSOR_URL_KEY)
            stored_active_folders = dict(state.get(ACTIVE_MAIL_FOLDERS_KEY) or {})

            drained = drain_mail_folder_delta(
                client=client,
                previous_cursor_url=stored_cursor_url,
                previous_active_mail_folders_by_id=stored_active_folders,
            )

            # Record progress so the next run resumes from this drain.
            state[MAIL_FOLDER_DELTA_CURSOR_URL_KEY] = drained.cursor_url
            state[ACTIVE_MAIL_FOLDERS_KEY] = drained.active_mail_folders_by_id

            cached_folder_snapshot = drained
        return cached_folder_snapshot

    @ctx_dlt_resource(
        name=dlt_resource_name("mail_folders"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id", "id"),
        columns={
            **MAIL_FOLDER_COLUMN_DESCRIPTIONS,
            "_ctx_deleted": {"hard_delete": True},
        },
    )
    def mail_folders() -> Iterator[MailFolderRow]:
        drained_rows = get_mail_folder_delta_snapshot().mail_folder_rows
        yield from mail_folder_rows_to_ctx_models(binding_id, drained_rows)

    @ctx_dlt_resource(
        name=dlt_resource_name("messages"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id", "id", "parent_folder_id"),
        columns={
            **MESSAGE_COLUMN_DESCRIPTIONS,
            "_ctx_deleted": {"hard_delete": True},
        },
    )
    def messages() -> Iterator[MessageRow]:
        folder_ids = sorted(get_mail_folder_delta_snapshot().active_mail_folders_by_id)

        state = dlt.current.source_state()
        stored_cursor_urls = dict(
            state.get(MESSAGE_DELTA_CURSOR_URLS_BY_FOLDER_ID_KEY) or {}
        )
        # Cursors of folders that are no longer active are dropped here.
        resume_cursor_urls = message_delta_cursor_urls_for_active_mail_folders(
            stored_cursor_urls,
            folder_ids,
        )
        next_cursor_urls = dict(resume_cursor_urls)

        for folder_id in folder_ids:
            resume_url = resume_cursor_urls.get(folder_id)
            latest_url: str | None = resume_url

            pages = drain_message_delta_pages_for_folder(
                client=client,
                folder_id=folder_id,
                previous_cursor_url=resume_url,
                initial_message_delta_top=initial_message_delta_top,
            )
            for page in pages:
                latest_url = page.cursor_url
                yield from message_rows_to_ctx_models(
                    binding_id,
                    page.rows,
                    folder_id=folder_id,
                )

            drained_to_delta_cursor = latest_url is not None and is_delta_cursor_url(
                latest_url
            )
            if not drained_to_delta_cursor:
                raise RuntimeError(
                    "message delta drain did not finish with a delta cursor"
                )

            next_cursor_urls = message_delta_cursor_urls_with_cursor_for_folder(
                next_cursor_urls,
                folder_id=folder_id,
                cursor_url=latest_url,
            )

        # Written only after every active folder drained successfully.
        state[MESSAGE_DELTA_CURSOR_URLS_BY_FOLDER_ID_KEY] = next_cursor_urls

    return (mail_folders, messages)
@@ -0,0 +1 @@
1
+ """Utilities for Microsoft Mail Graph and DLT spikes."""
@@ -0,0 +1,107 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ from base64 import b64decode
5
+ from collections.abc import Mapping, Sequence
6
+ from pathlib import PurePosixPath
7
+ from typing import Any
8
+
9
+ from shared_plugins.scratch import replace_scratch_dir_files
10
+
11
+ from ..models.ctx import AttachmentContentRow
12
+ from ..models.translators import attachment_content_row_from_graph_payload
13
+
14
# OData type identifiers for the three Graph attachment kinds named here.
REFERENCE_ATTACHMENT_ODATA_TYPE = "#microsoft.graph.referenceAttachment"
ITEM_ATTACHMENT_ODATA_TYPE = "#microsoft.graph.itemAttachment"
FILE_ATTACHMENT_ODATA_TYPE = "#microsoft.graph.fileAttachment"
# The three identifiers above as one immutable set.
# NOTE(review): not referenced in the visible part of this module —
# presumably used by callers for membership checks; confirm.
KNOWN_ATTACHMENT_ODATA_TYPES = frozenset(
    {
        FILE_ATTACHMENT_ODATA_TYPE,
        REFERENCE_ATTACHMENT_ODATA_TYPE,
        ITEM_ATTACHMENT_ODATA_TYPE,
    }
)
24
+
25
+
26
# Upper bound, in UTF-8 bytes, on the extension copied from an attachment
# name. With the 64-character digest this keeps the file name well under the
# common 255-byte file-name limit.
_MAX_SUFFIX_BYTES = 128
# Non-alphanumeric characters still accepted inside an extension.
_EXTRA_SUFFIX_CHARS = "_-+"


def _hash_path_segment(*parts: str) -> str:
    """Return the hex SHA-256 digest of *parts* joined with newlines."""
    hash_input = "\n".join(parts)
    return hashlib.sha256(hash_input.encode("utf-8")).hexdigest()


def _safe_trailing_suffixes(name: str) -> str:
    """Return the trailing run of filesystem-safe suffixes of *name*.

    Suffixes are examined from the last one backwards. Collection stops at
    the first suffix that is empty, contains a character outside
    alphanumerics and ``_-+``, or would push the total past
    ``_MAX_SUFFIX_BYTES``.
    """
    kept: list[str] = []
    remaining_bytes = _MAX_SUFFIX_BYTES
    for suffix in reversed(PurePosixPath(name.strip()).suffixes):
        extension = suffix[1:]
        if not extension or not all(
            char.isalnum() or char in _EXTRA_SUFFIX_CHARS for char in extension
        ):
            break
        remaining_bytes -= len(suffix.encode("utf-8"))
        if remaining_bytes < 0:
            break
        kept.append(suffix)
    return "".join(reversed(kept))


def _build_deterministic_file_name(
    *,
    attachment_id: str,
    name: str | None,
) -> str:
    """Build a stable on-disk file name for an attachment.

    The name is the SHA-256 digest of the attachment id and the original
    name, followed by the original name's extension(s). The attachment name
    is untrusted mail content, so only a trailing run of safe, bounded-length
    suffixes is carried over. Ordinary names such as ``report.pdf`` or
    ``archive.tar.gz`` keep their full extension; separators, whitespace,
    control characters and oversized extensions never reach the file system.

    Args:
        attachment_id: Graph attachment id.
        name: Original attachment name, or ``None``.

    Returns:
        ``<64 hex chars><suffix>``, where the suffix may be empty.
    """
    digest = _hash_path_segment(attachment_id, name or "")
    suffix = _safe_trailing_suffixes(name) if name else ""
    return f"{digest}{suffix}"
41
+
42
+
43
def materialize_attachment_payloads(
    *,
    binding_id: str,
    message_id: str,
    attachment_payloads: Sequence[Mapping[str, Any]],
) -> list[AttachmentContentRow]:
    """Write a message's file attachments to scratch and describe them.

    Every payload must carry a string ``id`` and base64 ``contentBytes``;
    reference and item attachments are expected to be filtered out upstream.
    All decoded files go into the per-message scratch directory through one
    ``replace_scratch_dir_files`` call. That helper atomically REPLACES the
    entire ``relative_dir``, so per-attachment calls would clobber each other
    within the same message.

    Returns:
        One ``AttachmentContentRow`` per payload, in input order; an empty
        list when there are no payloads.

    Raises:
        RuntimeError: If a payload lacks a string ``id`` or ``contentBytes``.
    """
    if len(attachment_payloads) == 0:
        return []

    names_by_attachment_id: dict[str, str] = {}
    decoded_by_file_name: dict[str, bytes] = {}
    for payload in attachment_payloads:
        attachment_id = payload.get("id")
        if not isinstance(attachment_id, str):
            raise RuntimeError(
                "attachment payload missing id "
                f"message_id={message_id} keys={sorted(payload)}"
            )
        encoded_content = payload.get("contentBytes")
        if not isinstance(encoded_content, str):
            raise RuntimeError(
                "attachment payload missing contentBytes "
                f"message_id={message_id} attachment_id={attachment_id}"
            )

        # Non-string names are treated as absent.
        raw_name = payload.get("name")
        target_name = _build_deterministic_file_name(
            attachment_id=attachment_id,
            name=raw_name if isinstance(raw_name, str) else None,
        )
        names_by_attachment_id[attachment_id] = target_name
        decoded_by_file_name[target_name] = b64decode(encoded_content)

    message_dir = f"attachments/{_hash_path_segment(message_id)}"
    written_paths = replace_scratch_dir_files(
        binding_id=binding_id,
        relative_dir=message_dir,
        files=decoded_by_file_name,
    )

    return [
        attachment_content_row_from_graph_payload(
            binding_id=binding_id,
            message_id=message_id,
            attachment_payload=payload,
            file_path=written_paths[names_by_attachment_id[payload["id"]]],
        )
        for payload in attachment_payloads
    ]