contextbase-plugin-gmail 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,494 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import logging
5
+ import time
6
+ from dataclasses import dataclass
7
+ from typing import Any, Callable, Iterator, TypeVar
8
+
9
+ from googleapiclient.errors import HttpError
10
+
11
+ from shared_plugins.exceptions import PluginConfigurationError, PluginCursorExpiredError
12
+ from shared_plugins.google_client.batch_retry import (
13
+ BatchRetryExhaustedError,
14
+ BatchRetryPolicy,
15
+ BatchTerminalSubrequestError,
16
+ execute_batch_with_failed_subset_retries,
17
+ )
18
+ from shared_plugins.google_client.http_errors import extract_http_status_code
19
+ from shared_plugins.models import IngressModel
20
+
21
+ from ..models.ingress import (
22
+ GmailHistoryListResponseIngress,
23
+ GmailHistoryRecordIngress,
24
+ GmailLabelIngress,
25
+ GmailLabelsListResponseIngress,
26
+ GmailMessageAttachmentIngress,
27
+ GmailMessageIngress,
28
+ GmailMessagesListResponseIngress,
29
+ GmailProfileIngress,
30
+ GmailThreadIngress,
31
+ GmailThreadsListResponseIngress,
32
+ )
33
+
34
# Module-level logger; the batch iterators on GmailApiClient use it to report
# per-batch size, duration and throughput.
+ logger = logging.getLogger(__name__)
35
+
36
# Type variable bound to IngressModel so that GmailApiClient._execute_model
# returns an instance of exactly the model class it was asked to validate.
+ T = TypeVar("T", bound=IngressModel)
37
# Value sent as ``userId`` on every request built in this module.
# NOTE(review): "me" is Gmail's alias for the authenticated user per the Gmail
# API reference — confirm if delegated mailboxes ever need to be supported.
+ USER_ID = "me"
38
# Default ``maxResults`` for messages.list / threads.list when the caller does
# not supply ``max_results``.
+ LIST_MAX_RESULTS = 500
39
# Default ``maxResults`` for history.list when the caller does not supply
# ``max_results``.
+ HISTORY_MAX_RESULTS = 500
40
# Passed as ``num_retries`` to ``request.execute(...)`` for non-batch calls.
+ API_NUM_RETRIES = 5
41
# Largest number of message ids accepted by get_messages_batch; also the default
# and the upper bound for ``batch_size`` in iter_message_batches.
+ MESSAGES_GET_BATCH_MAX_SUBREQUESTS = 20
42
# Retry policy handed to execute_batch_with_failed_subset_retries for
# messages.get batches; constructed with BatchRetryPolicy's defaults.
+ MESSAGES_GET_BATCH_RETRY_POLICY = BatchRetryPolicy()
43
# Largest number of (message_id, attachment_id) keys accepted by
# get_attachments_batch; also the default and the upper bound for ``batch_size``
# in iter_attachment_batches.
+ MESSAGES_ATTACHMENTS_GET_BATCH_MAX_SUBREQUESTS = 20
44
# Retry policy handed to execute_batch_with_failed_subset_retries for
# attachments.get batches; constructed with BatchRetryPolicy's defaults.
+ MESSAGES_ATTACHMENTS_GET_BATCH_RETRY_POLICY = BatchRetryPolicy()
45
+
46
+
47
@dataclass(frozen=True)
class HistoryPage:
    """Immutable container for one page of a Gmail history listing.

    ``GmailApiClient.iter_history_pages`` builds one instance per history list
    response it receives.
    """

    # History records carried by this page of the response.
    history: list[GmailHistoryRecordIngress]
    # The response's history id, converted to ``str`` by the producer.
    history_id: str
51
+
52
+
53
# Type alias for a zero-argument callable that builds a service object.
# NOTE(review): presumably it returns an authenticated Gmail API service, which
# is what GmailApiClient expects — confirm against the configured factories.
+ GmailServiceFactory = Callable[[], Any]
54
+
55
+
56
def load_gmail_service_factory(path: str) -> GmailServiceFactory:
    """Resolve a dotted ``module.attribute`` path to a callable factory.

    The text after the last ``.`` is treated as the attribute name and
    everything before it as the module to import.

    Args:
        path: Dotted path such as ``"pkg.module.make_service"``.

    Returns:
        The attribute found on the imported module.

    Raises:
        PluginConfigurationError: If ``path`` has no module part or no
            attribute part, or if the attribute is missing or not callable.

    Note:
        Exceptions raised by ``importlib.import_module`` itself are not
        caught here and propagate unchanged.
    """
    module_path, _sep, attribute = path.rpartition(".")
    if not (module_path and attribute):
        raise PluginConfigurationError(
            f"Invalid GMAIL_SERVICE_FACTORY '{path}'. Expected dotted path like 'pkg.module.make_service'."
        )

    # A missing attribute resolves to None, which fails the callable check below.
    candidate = getattr(importlib.import_module(module_path), attribute, None)
    if callable(candidate):
        return candidate

    raise PluginConfigurationError(
        f"GMAIL_SERVICE_FACTORY '{path}' does not resolve to a callable"
    )
69
+
70
+
71
class GmailApiClient:
    """Typed convenience wrapper around a Gmail API service object.

    Every request is issued with ``userId=USER_ID`` and the raw response is
    validated into the matching ingress model before being returned.
    """

    def __init__(self, *, service: Any) -> None:
        """Store the service object used to build every request.

        Args:
            service: The Gmail API service object. It must expose ``users()``
                and ``new_batch_http_request(callback=...)``.

        Raises:
            PluginConfigurationError: If ``service`` is ``None``.
        """
        if service is None:
            raise PluginConfigurationError(
                "No Gmail service provided. Pass an authenticated service via service=... "
                "or resolve one from GMAIL_SERVICE_FACTORY in your orchestrator."
            )
        self._service = service

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _execute_model(self, request: Any, model_type: type[T]) -> T:
        """Execute ``request`` and validate its payload as ``model_type``.

        ``num_retries=API_NUM_RETRIES`` is forwarded to ``request.execute``.
        """
        payload = request.execute(num_retries=API_NUM_RETRIES)
        return model_type.model_validate(payload)

    @staticmethod
    def _build_list_kwargs(
        *,
        query: str | None,
        label_ids: list[str] | None,
        page_size: int,
        page_token: str | None,
    ) -> dict[str, Any]:
        """Build the keyword arguments shared by messages.list and threads.list.

        Optional filters and the page token are only included when truthy.
        """
        kwargs: dict[str, Any] = {
            "userId": USER_ID,
            "maxResults": page_size,
        }
        if query:
            kwargs["q"] = query
        if label_ids:
            kwargs["labelIds"] = label_ids
        if page_token:
            kwargs["pageToken"] = page_token
        return kwargs

    @staticmethod
    def _validate_batch_size(batch_size: int, limit: int) -> None:
        """Raise ``ValueError`` unless ``1 <= batch_size <= limit``."""
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        if batch_size > limit:
            raise ValueError(
                "batch_size exceeds Gmail batch limit: "
                f"{batch_size} > {limit}"
            )

    @staticmethod
    def _attachment_from_payload(
        attachment_id: str,
        payload: Any,
    ) -> GmailMessageAttachmentIngress:
        """Convert a raw attachments.get payload into an attachment model.

        ``size`` is copied only when it is an ``int`` and ``data`` only when it
        is a ``str``; anything else (including a non-dict payload) becomes
        ``None``. The ``attachment_id`` is always the one that was requested,
        not a value read from the payload.
        """
        size: int | None = None
        data: str | None = None
        if isinstance(payload, dict):
            raw_size = payload.get("size")
            if isinstance(raw_size, int):
                size = raw_size

            raw_data = payload.get("data")
            if isinstance(raw_data, str):
                data = raw_data

        return GmailMessageAttachmentIngress.model_validate(
            {
                "attachment_id": attachment_id,
                "size": size,
                "data": data,
            }
        )

    def _build_message_get_request(self, message_id: str, *, fmt: str) -> Any:
        """Build (without executing) a messages.get request for ``message_id``."""
        return (
            self._service.users()
            .messages()
            .get(
                userId=USER_ID,
                id=message_id,
                format=fmt,
            )
        )

    # ------------------------------------------------------------------
    # Profile and labels
    # ------------------------------------------------------------------

    def get_profile(self) -> GmailProfileIngress:
        """Fetch the mailbox profile via ``users().getProfile``."""
        request = self._service.users().getProfile(userId=USER_ID)
        return self._execute_model(request, GmailProfileIngress)

    def iter_labels(self) -> Iterator[GmailLabelIngress]:
        """Yield each label from a single ``labels().list`` response."""
        request = self._service.users().labels().list(userId=USER_ID)
        payload = self._execute_model(request, GmailLabelsListResponseIngress)
        yield from payload.labels

    # ------------------------------------------------------------------
    # Messages
    # ------------------------------------------------------------------

    def iter_message_ids(
        self,
        *,
        query: str | None = None,
        label_ids: list[str] | None = None,
        max_results: int | None = None,
    ) -> Iterator[str]:
        """Yield message ids one at a time across all result pages.

        Takes the same arguments as :meth:`iter_message_id_pages` and simply
        flattens its pages.
        """
        for message_ids in self.iter_message_id_pages(
            query=query,
            label_ids=label_ids,
            max_results=max_results,
        ):
            yield from message_ids

    def iter_message_id_pages(
        self,
        *,
        query: str | None = None,
        label_ids: list[str] | None = None,
        max_results: int | None = None,
    ) -> Iterator[list[str]]:
        """Yield one list of message ids per ``messages().list`` page.

        Args:
            query: Optional value for the ``q`` parameter; omitted when falsy.
            label_ids: Optional value for ``labelIds``; omitted when falsy.
            max_results: Per-request ``maxResults`` (page size). Falls back to
                ``LIST_MAX_RESULTS`` when falsy. It does NOT cap the total
                number of ids: paging continues until the response carries no
                next page token.
        """
        page_token: str | None = None
        page_size = max_results or LIST_MAX_RESULTS

        while True:
            kwargs = self._build_list_kwargs(
                query=query,
                label_ids=label_ids,
                page_size=page_size,
                page_token=page_token,
            )
            request = self._service.users().messages().list(**kwargs)
            payload = self._execute_model(request, GmailMessagesListResponseIngress)

            yield [item.id for item in payload.messages]

            page_token = payload.next_page_token
            if not page_token:
                break

    def get_message(
        self,
        message_id: str,
        *,
        fmt: str = "full",
    ) -> GmailMessageIngress:
        """Fetch a single message; ``fmt`` is sent as the ``format`` parameter."""
        request = self._build_message_get_request(message_id, fmt=fmt)
        return self._execute_model(request, GmailMessageIngress)

    def get_messages_batch(
        self,
        message_ids: list[str],
        *,
        fmt: str = "full",
    ) -> list[GmailMessageIngress]:
        """Fetch several messages through one batch, retrying failed subrequests.

        Args:
            message_ids: Ids to fetch; at most
                ``MESSAGES_GET_BATCH_MAX_SUBREQUESTS`` entries.
            fmt: Sent as the ``format`` parameter of every subrequest.

        Returns:
            One validated message per input id, in the same order as
            ``message_ids``. An empty input returns an empty list without
            touching the service.

        Raises:
            ValueError: If ``message_ids`` is longer than the batch limit.
            RuntimeError: If a subrequest fails terminally, if retries are
                exhausted, or if a response is missing for any id.
        """
        if not message_ids:
            return []
        if len(message_ids) > MESSAGES_GET_BATCH_MAX_SUBREQUESTS:
            raise ValueError(
                "message_ids exceeds Gmail batch limit: "
                f"{len(message_ids)} > {MESSAGES_GET_BATCH_MAX_SUBREQUESTS}"
            )

        # Subrequests are keyed by their position so duplicates in the input
        # stay distinct and responses can be returned in input order.
        request_id_to_message_id: dict[str, str] = {}
        request_factories: dict[str, Callable[[], Any]] = {}
        for index, message_id in enumerate(message_ids):
            request_id = str(index)
            request_id_to_message_id[request_id] = message_id
            # Bind message_id as a default to avoid late-binding of the loop variable.
            request_factories[request_id] = lambda m_id=message_id: (
                self._build_message_get_request(m_id, fmt=fmt)
            )

        try:
            responses = execute_batch_with_failed_subset_retries(
                request_factories=request_factories,
                new_batch=lambda callback: self._service.new_batch_http_request(
                    callback=callback
                ),
                policy=MESSAGES_GET_BATCH_RETRY_POLICY,
            )
        except BatchTerminalSubrequestError as exc:
            failed_message_id = request_id_to_message_id.get(
                exc.request_id, exc.request_id
            )
            raise RuntimeError(
                "Gmail batch messages.get failed with a non-retryable subrequest "
                f"for message_id={failed_message_id}: {exc.exception}"
            ) from exc.exception
        except BatchRetryExhaustedError as exc:
            failed_ids = [
                request_id_to_message_id.get(request_id, request_id)
                for request_id in exc.failed_request_ids
            ]
            sample_ids = ", ".join(failed_ids[:5])
            raise RuntimeError(
                "Gmail batch messages.get failed after retries for "
                f"{len(failed_ids)} of {len(message_ids)} subrequests. "
                f"Sample failed message IDs: {sample_ids}"
            ) from exc

        messages: list[GmailMessageIngress] = []
        for index, message_id in enumerate(message_ids):
            request_id = str(index)
            if request_id not in responses:
                raise RuntimeError(
                    "Gmail batch messages.get missing response payload for "
                    f"message_id={message_id}"
                )
            messages.append(GmailMessageIngress.model_validate(responses[request_id]))
        return messages

    def iter_message_batches(
        self,
        message_ids: list[str],
        *,
        batch_size: int = MESSAGES_GET_BATCH_MAX_SUBREQUESTS,
        fmt: str = "full",
    ) -> Iterator[list[GmailMessageIngress]]:
        """Fetch ``message_ids`` in consecutive batches, yielding each batch.

        Each batch is fetched with :meth:`get_messages_batch` and its timing is
        logged at INFO level.

        Because this is a generator, ``batch_size`` is validated when iteration
        starts, not when the method is called.

        Raises:
            ValueError: If ``batch_size`` is below 1 or above
                ``MESSAGES_GET_BATCH_MAX_SUBREQUESTS``.
        """
        self._validate_batch_size(batch_size, MESSAGES_GET_BATCH_MAX_SUBREQUESTS)

        prev_batch_end: float | None = None
        for offset in range(0, len(message_ids), batch_size):
            batch_ids = message_ids[offset : offset + batch_size]
            batch_start = time.monotonic()
            # Time elapsed since the previous batch finished; this includes
            # whatever the consumer did between two ``next()`` calls.
            delta_since_prev = (
                batch_start - prev_batch_end if prev_batch_end is not None else 0.0
            )
            logger.info(
                "gmail messages.get batch starting: size=%d offset=%d delta_since_prev=%.3fs",
                len(batch_ids),
                offset,
                delta_since_prev,
            )
            batch = self.get_messages_batch(batch_ids, fmt=fmt)
            batch_end = time.monotonic()
            duration = batch_end - batch_start
            logger.info(
                "gmail messages.get batch complete: returned=%d duration=%.3fs rate=%.1f msgs/s",
                len(batch),
                duration,
                # Guard against a zero duration when computing the rate.
                len(batch) / max(duration, 1e-9),
            )
            prev_batch_end = batch_end
            yield batch

    # ------------------------------------------------------------------
    # Attachments
    # ------------------------------------------------------------------

    def get_attachment(
        self,
        message_id: str,
        attachment_id: str,
    ) -> GmailMessageAttachmentIngress:
        """Fetch one attachment body via ``attachments().get``.

        Returns:
            An attachment model carrying the requested ``attachment_id`` plus
            the ``size`` / ``data`` fields of the response when they have the
            expected types (``int`` / ``str``), otherwise ``None``.
        """
        request = (
            self._service.users()
            .messages()
            .attachments()
            .get(
                userId=USER_ID,
                messageId=message_id,
                id=attachment_id,
            )
        )
        payload = request.execute(num_retries=API_NUM_RETRIES)
        return self._attachment_from_payload(attachment_id, payload)

    def get_attachments_batch(
        self,
        keys: list[tuple[str, str]],
    ) -> list[GmailMessageAttachmentIngress]:
        """Fetch several attachments through one batch, retrying failed subrequests.

        Args:
            keys: ``(message_id, attachment_id)`` pairs; at most
                ``MESSAGES_ATTACHMENTS_GET_BATCH_MAX_SUBREQUESTS`` entries.

        Returns:
            One attachment model per input key, in the same order as ``keys``.
            An empty input returns an empty list without touching the service.

        Raises:
            ValueError: If ``keys`` is longer than the batch limit.
            RuntimeError: If a subrequest fails terminally, if retries are
                exhausted, or if a response is missing for any key.
        """
        if not keys:
            return []
        if len(keys) > MESSAGES_ATTACHMENTS_GET_BATCH_MAX_SUBREQUESTS:
            raise ValueError(
                "keys exceeds Gmail batch limit: "
                f"{len(keys)} > {MESSAGES_ATTACHMENTS_GET_BATCH_MAX_SUBREQUESTS}"
            )

        # Subrequests are keyed by their position so duplicates in the input
        # stay distinct and responses can be returned in input order.
        request_id_to_key: dict[str, tuple[str, str]] = {}
        request_factories: dict[str, Callable[[], Any]] = {}
        for index, (message_id, attachment_id) in enumerate(keys):
            request_id = str(index)
            request_id_to_key[request_id] = (message_id, attachment_id)
            # Bind the ids as defaults to avoid late-binding of the loop variables.
            request_factories[request_id] = lambda m_id=message_id, a_id=attachment_id: (
                self._service.users()
                .messages()
                .attachments()
                .get(userId=USER_ID, messageId=m_id, id=a_id)
            )

        try:
            responses = execute_batch_with_failed_subset_retries(
                request_factories=request_factories,
                new_batch=lambda callback: self._service.new_batch_http_request(
                    callback=callback
                ),
                policy=MESSAGES_ATTACHMENTS_GET_BATCH_RETRY_POLICY,
            )
        except BatchTerminalSubrequestError as exc:
            # Fall back to the raw request id (as the other handlers do) so the
            # message never degrades to an uninformative ``key=None``.
            failed_key = request_id_to_key.get(exc.request_id, exc.request_id)
            raise RuntimeError(
                "Gmail batch attachments.get failed with a non-retryable subrequest "
                f"for key={failed_key}: {exc.exception}"
            ) from exc.exception
        except BatchRetryExhaustedError as exc:
            failed_keys = [
                str(request_id_to_key.get(request_id, request_id))
                for request_id in exc.failed_request_ids
            ]
            sample_keys = ", ".join(failed_keys[:5])
            raise RuntimeError(
                "Gmail batch attachments.get failed after retries for "
                f"{len(failed_keys)} of {len(keys)} subrequests. "
                f"Sample failed keys: {sample_keys}"
            ) from exc

        attachments: list[GmailMessageAttachmentIngress] = []
        for index, (_message_id, attachment_id) in enumerate(keys):
            request_id = str(index)
            if request_id not in responses:
                raise RuntimeError(
                    "Gmail batch attachments.get missing response payload for "
                    f"key={request_id_to_key[request_id]}"
                )
            attachments.append(
                self._attachment_from_payload(attachment_id, responses[request_id])
            )
        return attachments

    def iter_attachment_batches(
        self,
        keys: list[tuple[str, str]],
        *,
        batch_size: int = MESSAGES_ATTACHMENTS_GET_BATCH_MAX_SUBREQUESTS,
    ) -> Iterator[list[GmailMessageAttachmentIngress]]:
        """Fetch ``keys`` in consecutive batches, yielding each batch.

        Each batch is fetched with :meth:`get_attachments_batch` and its timing
        is logged at INFO level.

        Because this is a generator, ``batch_size`` is validated when iteration
        starts, not when the method is called.

        Raises:
            ValueError: If ``batch_size`` is below 1 or above
                ``MESSAGES_ATTACHMENTS_GET_BATCH_MAX_SUBREQUESTS``.
        """
        self._validate_batch_size(
            batch_size, MESSAGES_ATTACHMENTS_GET_BATCH_MAX_SUBREQUESTS
        )

        prev_batch_end: float | None = None
        for offset in range(0, len(keys), batch_size):
            batch_keys = keys[offset : offset + batch_size]
            batch_start = time.monotonic()
            # Time elapsed since the previous batch finished; this includes
            # whatever the consumer did between two ``next()`` calls.
            delta_since_prev = (
                batch_start - prev_batch_end if prev_batch_end is not None else 0.0
            )
            logger.info(
                "gmail attachments.get batch starting: size=%d offset=%d delta_since_prev=%.3fs",
                len(batch_keys),
                offset,
                delta_since_prev,
            )
            batch = self.get_attachments_batch(batch_keys)
            batch_end = time.monotonic()
            duration = batch_end - batch_start
            logger.info(
                "gmail attachments.get batch complete: returned=%d duration=%.3fs rate=%.1f atts/s",
                len(batch),
                duration,
                # Guard against a zero duration when computing the rate.
                len(batch) / max(duration, 1e-9),
            )
            prev_batch_end = batch_end
            yield batch

    # ------------------------------------------------------------------
    # Threads
    # ------------------------------------------------------------------

    def iter_thread_ids(
        self,
        *,
        query: str | None = None,
        label_ids: list[str] | None = None,
        max_results: int | None = None,
    ) -> Iterator[str]:
        """Yield thread ids one at a time across all ``threads().list`` pages.

        Args:
            query: Optional value for the ``q`` parameter; omitted when falsy.
            label_ids: Optional value for ``labelIds``; omitted when falsy.
            max_results: Per-request ``maxResults`` (page size). Falls back to
                ``LIST_MAX_RESULTS`` when falsy. It does NOT cap the total
                number of ids: paging continues until the response carries no
                next page token.
        """
        page_token: str | None = None
        page_size = max_results or LIST_MAX_RESULTS

        while True:
            kwargs = self._build_list_kwargs(
                query=query,
                label_ids=label_ids,
                page_size=page_size,
                page_token=page_token,
            )
            request = self._service.users().threads().list(**kwargs)
            payload = self._execute_model(request, GmailThreadsListResponseIngress)

            for item in payload.threads:
                yield item.id

            page_token = payload.next_page_token
            if not page_token:
                break

    def get_thread(self, thread_id: str, *, fmt: str = "full") -> GmailThreadIngress:
        """Fetch a single thread; ``fmt`` is sent as the ``format`` parameter."""
        request = (
            self._service.users()
            .threads()
            .get(
                userId=USER_ID,
                id=thread_id,
                format=fmt,
            )
        )
        return self._execute_model(request, GmailThreadIngress)

    # ------------------------------------------------------------------
    # History
    # ------------------------------------------------------------------

    def iter_history_pages(
        self,
        *,
        start_history_id: str,
        max_results: int | None = None,
    ) -> Iterator[HistoryPage]:
        """Yield one :class:`HistoryPage` per ``history().list`` response page.

        Args:
            start_history_id: Sent (as ``str``) as ``startHistoryId`` on every
                page request.
            max_results: Per-request ``maxResults`` (page size). Falls back to
                ``HISTORY_MAX_RESULTS`` when falsy.

        Raises:
            PluginCursorExpiredError: If a page request fails with an
                ``HttpError`` whose status code is 404.
            HttpError: Any other HTTP failure is re-raised unchanged.
        """
        page_token: str | None = None
        page_size = max_results or HISTORY_MAX_RESULTS

        while True:
            kwargs: dict[str, Any] = {
                "userId": USER_ID,
                "startHistoryId": str(start_history_id),
                "maxResults": page_size,
            }
            if page_token:
                kwargs["pageToken"] = page_token

            try:
                request = self._service.users().history().list(**kwargs)
                payload = self._execute_model(request, GmailHistoryListResponseIngress)
            except HttpError as exc:
                # A 404 here is reported to the caller as an expired cursor.
                if extract_http_status_code(exc) == 404:
                    raise PluginCursorExpiredError(
                        "Gmail history cursor is stale. Reset history source state and rerun history sync to reseed from the current profile historyId. "
                        "For strict completeness after expiration, run backfill as well."
                    ) from exc
                raise

            yield HistoryPage(
                history=payload.history, history_id=str(payload.history_id)
            )

            page_token = payload.next_page_token
            if not page_token:
                break