contextbase-plugin-gmail 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,470 @@
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import binascii
5
+ import codecs
6
+ import re
7
+ from collections.abc import Iterable, Iterator, Mapping
8
+ from datetime import datetime, timezone
9
+ from email.utils import parsedate_to_datetime
10
+ from typing import Any
11
+ from urllib.parse import unquote
12
+
13
+ from shared_plugins.values import non_empty_string
14
+
15
+ from .ctx import (
16
+ AttachmentCandidateProjection,
17
+ HistoryEventRow,
18
+ LabelRow,
19
+ MessageRow,
20
+ ProfileRow,
21
+ )
22
+ from .ingress import (
23
+ GmailAttachmentIngress,
24
+ GmailHistoryRecordIngress,
25
+ GmailLabelIngress,
26
+ GmailMessageIngress,
27
+ GmailMessagePartHeaderIngress,
28
+ GmailMessagePartIngress,
29
+ GmailProfileIngress,
30
+ )
31
+
32
# MIME types that make a part an attachment candidate on their own, even when
# the part has neither a filename nor an attachment id.
_SPECIAL_ATTACHMENT_MIME_TYPES = {"message/delivery-status", "message/rfc822"}

# MIME types whose decoded payload is collected into the message body fields.
_SUPPORTED_BODY_MIME_TYPES = {"text/plain", "text/html"}
40
+
41
+
42
class Base64UrlDecodeError(ValueError):
    """Signal that a non-empty base64url payload could not be decoded.

    Subclasses ``ValueError`` so handlers written for ``ValueError`` also
    catch it.
    """
44
+
45
+
46
def utc_now_iso() -> str:
    """Return the current time in UTC as an ISO 8601 string."""
    current = datetime.now(timezone.utc)
    return current.isoformat()
48
+
49
+
50
def profiles_to_ctx_models(
    binding_id: str,
    profiles: Iterable[GmailProfileIngress],
) -> Iterator[ProfileRow]:
    """Lazily convert ingested Gmail profiles into ``ProfileRow`` models.

    Args:
        binding_id: Stored on every row as ``ctx_binding_id``.
        profiles: Ingested profile records to convert.

    Yields:
        One ``ProfileRow`` per input profile, in input order.
    """
    for item in profiles:
        row = ProfileRow(
            ctx_binding_id=binding_id,
            email_address=item.email_address,
            messages_total=item.messages_total,
            threads_total=item.threads_total,
            history_id=item.history_id,
        )
        yield row
62
+
63
+
64
def labels_to_ctx_models(
    binding_id: str,
    labels: Iterable[GmailLabelIngress],
) -> Iterator[LabelRow]:
    """Lazily convert ingested Gmail labels into ``LabelRow`` models.

    Args:
        binding_id: Stored on every row as ``ctx_binding_id``.
        labels: Ingested label records to convert.

    Yields:
        One ``LabelRow`` per input label, in input order.
    """
    for item in labels:
        # A falsy colour (including a missing one) is stored as None;
        # otherwise it is dumped to a plain dict using field aliases.
        if item.color:
            color_payload = item.color.model_dump(by_alias=True)
        else:
            color_payload = None

        yield LabelRow(
            ctx_binding_id=binding_id,
            id=item.id,
            name=item.name,
            type=item.type,
            message_list_visibility=item.message_list_visibility,
            label_list_visibility=item.label_list_visibility,
            messages_total=item.messages_total,
            messages_unread=item.messages_unread,
            threads_total=item.threads_total,
            threads_unread=item.threads_unread,
            color=color_payload,
        )
82
+
83
+
84
def history_events_to_ctx_models(
    binding_id: str,
    history_events: Iterable[GmailHistoryRecordIngress],
) -> Iterator[HistoryEventRow]:
    """Lazily convert ingested history records into ``HistoryEventRow`` models.

    Each record is dumped with field aliases and its change lists are read
    from the dump by alias key. A key absent from the dump becomes an
    empty list.

    Args:
        binding_id: Stored on every row as ``ctx_binding_id``.
        history_events: Ingested history records to convert.

    Yields:
        One ``HistoryEventRow`` per input record, in input order.
    """
    for record in history_events:
        dumped = record.model_dump(by_alias=True)
        row = HistoryEventRow(
            ctx_binding_id=binding_id,
            id=record.id,
            messages=dumped.get("messages", []),
            messages_added=dumped.get("messagesAdded", []),
            messages_deleted=dumped.get("messagesDeleted", []),
            labels_added=dumped.get("labelsAdded", []),
            labels_removed=dumped.get("labelsRemoved", []),
        )
        yield row
99
+
100
+
101
def messages_to_ctx_models(
    binding_id: str,
    messages: Iterable[GmailMessageIngress],
) -> Iterator[MessageRow]:
    """Lazily convert ingested Gmail messages into ``MessageRow`` models.

    For each message the top-level payload headers are searched for the
    well-known address/threading headers, the text and HTML bodies are
    decoded from the MIME tree, and attachment descriptors are collected.

    Args:
        binding_id: Stored on every row as ``ctx_binding_id``.
        messages: Ingested message records to convert.

    Yields:
        One ``MessageRow`` per input message, in input order.

    Raises:
        RuntimeError: Propagated from body extraction when a body part
            cannot be decoded.
    """
    for msg in messages:
        root_part = msg.payload
        header_list = root_part.headers if root_part else []
        plain_body, html_body = _extract_bodies(root_part, message_id=msg.id)
        attachment_models = _extract_attachments(root_part)

        def header(name: str) -> str | None:
            # Look up one header on the top-level payload.
            return _extract_header(header_list, name)

        yield MessageRow(
            ctx_binding_id=binding_id,
            # internal_date is treated as epoch milliseconds by the helper.
            ctx_source_updated_at=_internal_date_to_datetime(msg.internal_date),
            id=msg.id,
            thread_id=msg.thread_id,
            label_ids=list(msg.label_ids),
            snippet=msg.snippet,
            history_id=msg.history_id,
            internal_date=msg.internal_date,
            size_estimate=msg.size_estimate,
            subject=header("Subject"),
            from_address=header("From"),
            to_addresses=header("To"),
            cc_addresses=header("Cc"),
            bcc_addresses=header("Bcc"),
            reply_to=header("Reply-To"),
            message_id_header=header("Message-Id"),
            in_reply_to=header("In-Reply-To"),
            references_header=header("References"),
            date=_parse_date_header(header("Date")),
            body_text=plain_body,
            body_html=html_body,
            mime_type=root_part.mime_type if root_part else None,
            classification_label_values=list(msg.classification_label_values),
            attachment_count=len(attachment_models),
            attachments=[
                model.model_dump(exclude_none=True) for model in attachment_models
            ],
        )
138
+
139
+
140
def attachment_candidate_rows_to_ctx_models(
    rows: Iterable[Mapping[str, Any]],
) -> Iterator[AttachmentCandidateProjection]:
    """Lazily validate raw row mappings as ``AttachmentCandidateProjection``.

    Each mapping is copied into a plain ``dict`` before validation.

    Args:
        rows: Row mappings to validate.

    Yields:
        One validated projection per input row, in input order.
    """
    for raw_row in rows:
        as_dict = dict(raw_row)
        yield AttachmentCandidateProjection.model_validate(as_dict)
145
+
146
+
147
def _extract_header(
    headers: list[GmailMessagePartHeaderIngress],
    name: str,
) -> str | None:
    """Return the value of the first header whose name matches *name*.

    Names are compared case-insensitively using ``str.casefold``.

    Args:
        headers: Header objects exposing ``name`` and ``value``.
        name: Header name to look for.

    Returns:
        The first matching header's value, or ``None`` when none matches.
    """
    wanted = name.casefold()
    matches = (entry.value for entry in headers if entry.name.casefold() == wanted)
    return next(matches, None)
156
+
157
+
158
def decode_base64url_bytes(value: str | None) -> bytes | None:
    """Decode a base64url string, tolerating missing ``=`` padding.

    Args:
        value: The encoded text. Surrounding whitespace is ignored.

    Returns:
        ``None`` when *value* is not a string (including ``None``),
        ``b""`` when it is empty or whitespace-only, otherwise the
        decoded bytes.

    Raises:
        Base64UrlDecodeError: If the non-empty payload is not valid
            base64url.
    """
    if not isinstance(value, str):
        return None

    stripped = value.strip()
    if not stripped:
        return b""

    # Restore the padding needed to reach a multiple of four characters.
    candidate = stripped + "=" * (-len(stripped) % 4)
    try:
        return base64.b64decode(candidate, altchars=b"-_", validate=True)
    except (binascii.Error, ValueError) as exc:
        raise Base64UrlDecodeError("Invalid base64url payload.") from exc
171
+
172
+
173
def _resolve_mime_charset(charset: str | None) -> str:
    """Map a MIME charset label to the canonical Python codec name.

    Args:
        charset: Charset label from a part header; may be missing.

    Returns:
        ``"utf-8"`` when no usable label is given, otherwise the codec
        name reported by ``codecs.lookup``.

    Raises:
        ValueError: If the label does not name a known codec.
    """
    label = non_empty_string(charset)
    if label is None:
        return "utf-8"

    try:
        codec_info = codecs.lookup(label)
    except LookupError as exc:
        raise ValueError(f"Unsupported MIME charset: {label}") from exc
    return codec_info.name
182
+
183
+
184
def _decode_base64url_text(
    value: str | None,
    *,
    charset: str | None = None,
) -> str | None:
    """Decode a base64url payload into text.

    Undecodable byte sequences are substituted rather than raising
    (``errors="replace"``).

    Args:
        value: Base64url-encoded payload; may be missing.
        charset: Charset label for the payload; defaults to UTF-8 when
            absent.

    Returns:
        The decoded text, or ``None`` when the payload is missing or
        decodes to zero bytes.

    Raises:
        Base64UrlDecodeError: If the payload is not valid base64url.
        ValueError: If *charset* names an unknown codec.
    """
    raw = decode_base64url_bytes(value)
    if not raw:
        # Covers both a missing payload (None) and an empty one (b"").
        return None
    codec_name = _resolve_mime_charset(charset)
    return raw.decode(codec_name, errors="replace")
193
+
194
+
195
def _walk_mime_parts(
    part: GmailMessagePartIngress,
    *,
    mime_path: str,
) -> list[tuple[GmailMessagePartIngress, str]]:
    """Flatten a MIME tree into ``(part, path)`` pairs, parent first.

    A child's path is its parent's path plus ``.<index>``, where the index
    is the child's zero-based position in ``parent.parts``.

    Args:
        part: Root of the (sub)tree to walk.
        mime_path: Path label assigned to *part* itself.

    Returns:
        Every part in the subtree in depth-first, pre-order sequence.
    """
    collected: list[tuple[GmailMessagePartIngress, str]] = [(part, mime_path)]
    for position, sub_part in enumerate(part.parts):
        collected += _walk_mime_parts(sub_part, mime_path=f"{mime_path}.{position}")
    return collected
205
+
206
+
207
def _extract_bodies(
    part: GmailMessagePartIngress | None,
    *,
    message_id: str | None = None,
) -> tuple[str | None, str | None]:
    """Collect the plain-text and HTML bodies from a MIME tree.

    Every ``text/plain`` and ``text/html`` part in the tree is decoded.
    Chunks of the same kind are joined with newlines and the result is
    stripped of surrounding whitespace.

    Args:
        part: Root MIME part, or ``None`` when the message has no payload.
        message_id: Only used to label the error raised on decode failure.

    Returns:
        ``(body_text, body_html)``; each element is ``None`` when no
        non-empty content of that kind was found.

    Raises:
        RuntimeError: If a body part's payload or charset cannot be
            decoded. The message names the part's path, type and charset.
    """
    if part is None:
        return None, None

    plain_chunks: list[str] = []
    markup_chunks: list[str] = []

    for node, node_path in _walk_mime_parts(part, mime_path="0"):
        kind = non_empty_string(node.mime_type)
        if kind not in _SUPPORTED_BODY_MIME_TYPES:
            continue

        encoded_body = node.body.data if node.body else None
        charset = _extract_mime_part_charset(node.headers)

        try:
            text = _decode_base64url_text(encoded_body, charset=charset)
        except (Base64UrlDecodeError, ValueError) as exc:
            raise RuntimeError(
                "Failed to decode Gmail MIME body "
                f"(message_id={message_id or '-'}, mime_path={node_path}, "
                f"mime_type={kind}, charset={charset or 'utf-8'}): {exc}"
            ) from exc

        if text is None:
            continue

        if kind == "text/plain":
            plain_chunks.append(text)
        elif kind == "text/html":
            markup_chunks.append(text)

    # An empty chunk list joins to "", which collapses to None like a
    # whitespace-only body does.
    joined_text = "\n".join(plain_chunks).strip()
    joined_html = "\n".join(markup_chunks).strip()
    return joined_text or None, joined_html or None
246
+
247
+
248
def _normalize_attachment_filename(value: str | None) -> str | None:
    """Return a usable attachment filename, or ``None`` if it is rejected.

    Rejected values: missing/empty names, the directory names ``.`` and
    ``..``, and any name containing ``/``, a backslash or a NUL character.

    Args:
        value: Candidate filename; may be missing.

    Returns:
        The normalized filename, or ``None`` when it is rejected.
    """
    name = non_empty_string(value)
    if name is None or name in (".", ".."):
        return None
    if any(forbidden in name for forbidden in ("/", "\\", "\x00")):
        return None
    return name
257
+
258
+
259
def _decode_header_value(value: str) -> str:
    """Trim a parameter value and drop one pair of matching outer quotes.

    Args:
        value: Raw parameter value as captured from a header.

    Returns:
        The value without surrounding whitespace and, when it is wrapped
        in a matching pair of double or single quotes, without them.
    """
    cleaned = value.strip()
    if len(cleaned) >= 2 and cleaned[0] == cleaned[-1] and cleaned[0] in "\"'":
        cleaned = cleaned[1:-1]
    return cleaned


# ``charset'language'percent-encoded-value``; charset and language may be empty.
_EXTENDED_VALUE_PATTERN = re.compile(
    r"(?P<charset>[A-Za-z0-9!#$%&+^_`{}~.-]*)'"
    r"(?P<language>[A-Za-z0-9-]*)'"
    r"(?P<encoded>.*)",
    re.DOTALL,
)


def _decode_extended_header_value(value: str) -> str:
    """Decode an extended (``param*=``) header parameter value.

    The expected form is ``charset'language'percent-encoded``. The
    percent-encoded part is decoded with the declared charset, falling
    back to UTF-8 when the charset is empty or unknown. A value without
    that prefix is percent-decoded as UTF-8 in full.

    Args:
        value: Raw parameter value as captured from a header.

    Returns:
        The decoded text. Undecodable bytes are substituted, not raised.
    """
    cleaned = _decode_header_value(value)
    match = _EXTENDED_VALUE_PATTERN.fullmatch(cleaned)
    if match is None:
        return unquote(cleaned)

    charset = match.group("charset") or "utf-8"
    try:
        codecs.lookup(charset)
    except LookupError:
        charset = "utf-8"
    return unquote(match.group("encoded"), encoding=charset, errors="replace")


def _extract_header_param(value: str, parameter: str) -> str | None:
    """Extract one parameter from a structured header value.

    The extended form ``parameter*=`` takes precedence over the plain
    ``parameter=`` form. The parameter name is matched case-insensitively
    and only as a whole token, so ``name`` does not match inside
    ``filename``. A double-quoted plain value may contain ``;``.
    Continuation parameters (``parameter*0=``) are not handled.

    Args:
        value: Full header value, e.g. a Content-Disposition value.
        parameter: Parameter name to look for.

    Returns:
        The decoded parameter value, or ``None`` when it is absent.
    """
    name = re.escape(parameter)

    # ``\*`` is a literal asterisk; the lookbehind rejects matches that
    # start in the middle of a longer parameter name.
    extended = re.search(rf"(?<![\w-]){name}\*=([^;]+)", value, re.IGNORECASE)
    if extended:
        return _decode_extended_header_value(extended.group(1))

    plain = re.search(
        rf'(?<![\w-]){name}=("(?:[^"\\]|\\.)*"|[^;]+)',
        value,
        re.IGNORECASE,
    )
    if plain:
        return _decode_header_value(plain.group(1))

    return None
286
+
287
+
288
def _extract_mime_part_charset(
    headers: list[GmailMessagePartHeaderIngress],
) -> str | None:
    """Return the ``charset`` parameter of a part's Content-Type header.

    Args:
        headers: The MIME part's headers.

    Returns:
        The charset label, or ``None`` when the header or the parameter
        is missing or empty.
    """
    content_type_value = _extract_header(headers, "Content-Type")
    if content_type_value is not None:
        return non_empty_string(_extract_header_param(content_type_value, "charset"))
    return None
295
+
296
+
297
def _extract_filename_from_part_headers(headers: Mapping[str, str]) -> str | None:
    """Derive an attachment filename from a part's headers.

    Sources are tried in order: the ``filename`` parameter of
    Content-Disposition, the ``name`` parameter of Content-Type, then the
    raw Content-Description value.

    Args:
        headers: Part headers keyed by lower-case header name.

    Returns:
        The first non-empty candidate, or ``None`` when there is none.
    """
    parameter_sources = (
        ("content-disposition", "filename"),
        ("content-type", "name"),
    )
    for header_name, parameter in parameter_sources:
        header_value = headers.get(header_name)
        if not header_value:
            continue
        candidate = _extract_header_param(header_value, parameter)
        if candidate:
            return candidate

    description = headers.get("content-description")
    return description if description else None
315
+
316
+
317
def _extract_part_headers(
    headers: list[GmailMessagePartHeaderIngress],
) -> dict[str, str]:
    """Index a part's headers by lower-cased header name.

    When a name occurs more than once, the last occurrence wins.

    Args:
        headers: Header objects exposing ``name`` and ``value``.

    Returns:
        A dict mapping each lower-cased name to its value.
    """
    indexed: dict[str, str] = {}
    for entry in headers:
        indexed[entry.name.lower()] = entry.value
    return indexed
321
+
322
+
323
def _normalize_content_id(value: str | None) -> str | None:
    """Strip the enclosing angle brackets from a Content-ID value.

    Args:
        value: Raw Content-ID header value; may be missing.

    Returns:
        The id without a leading ``<`` and trailing ``>``, or ``None``
        when the value is missing or nothing remains.
    """
    raw_id = non_empty_string(value)
    if raw_id is None:
        return None
    bare_id = raw_id.removeprefix("<").removesuffix(">")
    return bare_id if bare_id else None
328
+
329
+
330
def _normalize_content_disposition(value: str | None) -> str | None:
    """Return the disposition type of a Content-Disposition value.

    Everything from the first ``;`` onward (the parameters) is discarded.

    Args:
        value: Raw Content-Disposition header value; may be missing.

    Returns:
        The trimmed text before the first ``;``, or ``None`` when the
        value is missing or that text is empty.
    """
    raw = non_empty_string(value)
    if raw is None:
        return None
    disposition_type, _, _ = raw.partition(";")
    disposition_type = disposition_type.strip()
    return disposition_type if disposition_type else None
336
+
337
+
338
def canonicalize_part_id(
    *,
    source_part_id: str | None,
    attachment_id: str | None,
    mime_path: str,
) -> str:
    """Choose a stable identifier for a MIME part.

    Preference order: the part id supplied by the source, then the
    attachment id (prefixed ``aid:``), then the part's position in the
    MIME tree (prefixed ``path:``).

    A blank attachment id is treated like a missing one, mirroring how
    the source part id is handled, so the degenerate id ``"aid:"`` is
    never produced.

    Args:
        source_part_id: Part id reported by the source; may be missing.
        attachment_id: Attachment id of the part's body; may be missing.
        mime_path: Path of the part within the MIME tree.

    Returns:
        The chosen identifier.
    """
    part_id = non_empty_string(source_part_id)
    if part_id is not None:
        return part_id
    # Only the presence check is normalized; the id itself is embedded
    # exactly as given.
    if non_empty_string(attachment_id) is not None:
        return f"aid:{attachment_id}"
    return f"path:{mime_path}"
350
+
351
+
352
def _extract_attachments(
    part: GmailMessagePartIngress | None,
) -> list[GmailAttachmentIngress]:
    """Collect attachment descriptors from a MIME tree.

    A part is a candidate when it has a usable filename (its own or one
    derived from its headers), an attachment id, or one of the special
    attachment MIME types. A candidate is kept only if it has an
    attachment id or inline data.

    Args:
        part: Root MIME part, or ``None`` when the message has no payload.

    Returns:
        One validated ``GmailAttachmentIngress`` per kept part, in
        tree-walk order; empty when *part* is ``None``.
    """
    if part is None:
        return []

    found: list[GmailAttachmentIngress] = []
    for node, node_path in _walk_mime_parts(part, mime_path="0"):
        body = node.body
        attachment_id = non_empty_string(body.attachment_id if body else None)
        inline_data = non_empty_string(body.data if body else None)

        header_map = _extract_part_headers(node.headers)
        filename = _normalize_attachment_filename(node.filename)
        if filename is None:
            # Fall back to a name derived from the part's headers.
            header_filename = _extract_filename_from_part_headers(header_map)
            filename = _normalize_attachment_filename(header_filename)

        has_identity = filename is not None or attachment_id is not None
        if not has_identity and node.mime_type not in _SPECIAL_ATTACHMENT_MIME_TYPES:
            continue

        # Without an attachment id or inline data there is nothing to record.
        if attachment_id is None and inline_data is None:
            continue

        descriptor = {
            "attachment_id": attachment_id,
            # Inline data is recorded only when there is no attachment id.
            "inline_data_b64url": inline_data if attachment_id is None else None,
            "part_id": canonicalize_part_id(
                source_part_id=node.part_id,
                attachment_id=attachment_id,
                mime_path=node_path,
            ),
            "filename": filename,
            "mime_type": node.mime_type,
            "size": body.size if body else None,
            "content_disposition": _normalize_content_disposition(
                header_map.get("content-disposition")
            ),
            "content_id": _normalize_content_id(header_map.get("content-id")),
        }
        found.append(GmailAttachmentIngress.model_validate(descriptor))

    return found
407
+
408
+
409
def _parse_date_header(value: str | None) -> datetime | None:
    """Parse an email ``Date`` header into a timezone-aware datetime.

    Args:
        value: Raw header value; may be missing.

    Returns:
        The parsed datetime, with UTC attached when the parser produced
        a naive value, or ``None`` when the header is missing or cannot
        be parsed.
    """
    raw_date = non_empty_string(value)
    if raw_date is None:
        return None

    try:
        moment = parsedate_to_datetime(raw_date)
    except (TypeError, ValueError):
        return None

    if moment is None:
        return None
    return moment if moment.tzinfo is not None else moment.replace(tzinfo=timezone.utc)
426
+
427
+
428
def _internal_date_to_datetime(value: int | None) -> datetime | None:
    """Convert an epoch-milliseconds timestamp to an aware UTC datetime.

    Args:
        value: Milliseconds since the Unix epoch; may be missing.

    Returns:
        The corresponding UTC datetime, or ``None`` when *value* is
        ``None`` or out of the representable range.
    """
    if value is not None:
        try:
            # The division stays inside the try: a huge int overflows here.
            epoch_seconds = value / 1000.0
            return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
        except (OverflowError, OSError, ValueError):
            pass
    return None
436
+
437
+
438
def extract_changed_ids(
    history_event: HistoryEventRow,
) -> tuple[set[str], set[str]]:
    """Collect the message and thread ids referenced by a history event.

    Ids are read from the event's direct ``messages`` entries and from the
    ``message`` mapping nested in each entry of ``messages_added``,
    ``messages_deleted``, ``labels_added`` and ``labels_removed``.
    Entries that are not mappings are ignored, as are falsy ids.

    Args:
        history_event: The history event row to inspect.

    Returns:
        ``(message_ids, thread_ids)`` as sets of strings.
    """
    message_ids: set[str] = set()
    thread_ids: set[str] = set()

    def record(entry: Mapping[str, Any]) -> None:
        # Read the "id" and "threadId" keys; skip values that are falsy.
        for key, bucket in (("id", message_ids), ("threadId", thread_ids)):
            identifier = entry.get(key)
            if identifier:
                bucket.add(str(identifier))

    # Start with the direct entries, then add each wrapper's nested message.
    candidates: list[Any] = list(history_event.messages)
    wrapper_groups = (
        history_event.messages_added,
        history_event.messages_deleted,
        history_event.labels_added,
        history_event.labels_removed,
    )
    for group in wrapper_groups:
        for wrapper in group:
            if isinstance(wrapper, Mapping):
                candidates.append(wrapper.get("message"))

    for candidate in candidates:
        if isinstance(candidate, Mapping):
            record(candidate)

    return message_ids, thread_ids
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Annotated
4
+
5
+ from pydantic import Field
6
+ from shared_plugins.models import IdStr
7
+
8
# Message, thread and label identifiers all use the shared id-string type.
MessageId = ThreadId = LabelId = IdStr

# An int validated as >= 0 with the field's strict flag enabled.
NonNegativeInt = Annotated[int, Field(ge=0, strict=True)]

# History ids use the same non-negative integer type.
HistoryId = NonNegativeInt
@@ -0,0 +1,9 @@
1
+ {
2
+ "auth": {
3
+ "provider_id": "google",
4
+ "scopes": ["https://www.googleapis.com/auth/gmail.readonly"],
5
+ "type": "oauth"
6
+ },
7
+ "mode": "dagster",
8
+ "plugin_id": "gmail"
9
+ }
File without changes