contextbase-plugin-gmail 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,307 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from collections.abc import Iterable, Iterator, Mapping, Sequence
5
+ from typing import Any
6
+
7
+ import dlt
8
+ from dlt.destinations.sql_client import SqlClientBase
9
+ from pydantic import ValidationError
10
+ from shared_plugins.dlt import destination_has_table
11
+ from shared_plugins.models import format_validation_error
12
+ from shared_plugins.naming import (
13
+ dlt_resource_name,
14
+ dlt_source_name,
15
+ plugin_id_from_module,
16
+ )
17
+ from shared_plugins.resources import ctx_dlt_resource
18
+ from shared_plugins.values import non_empty_string
19
+
20
+ from ..models.ctx import AttachmentCandidateProjection, AttachmentRow
21
+ from ..models.ingress import GmailMessageAttachmentIngress
22
+ from ..models.translators import attachment_candidate_rows_to_ctx_models
23
+ from ..utils.attachments import (
24
+ AttachmentMaterializationError,
25
+ FetchedAttachmentMap,
26
+ build_attachment_rows_for_message,
27
+ )
28
+ from ..utils.client import (
29
+ MESSAGES_ATTACHMENTS_GET_BATCH_MAX_SUBREQUESTS,
30
+ GmailApiClient,
31
+ )
32
+
33
+ PLUGIN_ID = plugin_id_from_module(__file__)
34
+ JOB = "attachment_content"
35
+ ATTACHMENT_CONTENT_CANDIDATE_LIMIT = 200
36
+ LOGGER = logging.getLogger(__name__)
37
+
38
+ AttachmentCandidate = AttachmentCandidateProjection
39
+
40
+
41
def parse_attachment_candidate(row: Mapping[str, Any]) -> AttachmentCandidateProjection:
    """Translate one raw candidate row into an ``AttachmentCandidateProjection``.

    The row is wrapped in a single-element list, handed to
    ``attachment_candidate_rows_to_ctx_models`` and the first model it
    produces is returned.
    """
    projected_models = attachment_candidate_rows_to_ctx_models([row])
    return next(projected_models)
43
+
44
+
45
def _validate_limit(limit: int) -> None:
    """Reject any ``limit`` that is not a plain integer of at least 1.

    Booleans are refused explicitly even though ``bool`` is a subclass of
    ``int``.

    Raises:
        ValueError: If ``limit`` is a bool, is not an int, or is below 1.
    """
    is_plain_int = isinstance(limit, int) and not isinstance(limit, bool)
    if is_plain_int and limit >= 1:
        return
    raise ValueError("limit must be an integer >= 1")
48
+
49
+
50
def build_attachment_candidate_query(*, limit: int) -> str:
    """Return the SQL that selects messages whose attachment rows are out of sync.

    A message qualifies when its ``attachment_count`` is positive and differs
    from the number of rows already present for it in ``attachments``.

    The statement carries three ``%s`` placeholders, in order: the binding id
    for the ``attachments`` sub-select, the binding id for ``messages``, and
    the row limit. ``limit`` is only validated here; it is not interpolated
    into the text.

    Raises:
        ValueError: If ``limit`` is not an integer >= 1.
    """
    _validate_limit(limit)

    statement = """
        SELECT
            m.id,
            m.attachment_count,
            m.attachments,
            COALESCE(a.attachment_row_count, 0) AS existing_attachment_count
        FROM messages AS m
        LEFT JOIN (
            SELECT
                message_id,
                COUNT(*)::bigint AS attachment_row_count
            FROM attachments
            WHERE _ctx_binding_id = %s
            GROUP BY message_id
        ) AS a
            ON a.message_id = m.id
        WHERE m._ctx_binding_id = %s
            AND m.attachment_count > 0
            AND m.attachment_count <> COALESCE(a.attachment_row_count, 0)
        ORDER BY m._ctx_source_updated_at DESC NULLS LAST, m.id ASC
        LIMIT %s
    """
    return statement.strip()
75
+
76
+
77
def build_attachment_candidate_query_bootstrap(*, limit: int) -> str:
    """Return the candidate SQL that does not touch the ``attachments`` table.

    Every message of the binding with a positive ``attachment_count`` is
    selected and ``existing_attachment_count`` is reported as a constant 0.

    The statement carries two ``%s`` placeholders, in order: the binding id
    and the row limit. ``limit`` is only validated here; it is not
    interpolated into the text.

    Raises:
        ValueError: If ``limit`` is not an integer >= 1.
    """
    _validate_limit(limit)

    statement = """
        SELECT
            m.id,
            m.attachment_count,
            m.attachments,
            0::bigint AS existing_attachment_count
        FROM messages AS m
        WHERE m._ctx_binding_id = %s
            AND m.attachment_count > 0
        ORDER BY m._ctx_source_updated_at DESC NULLS LAST, m.id ASC
        LIMIT %s
    """
    return statement.strip()
92
+
93
+
94
def iter_attachment_candidate_rows(
    sql_client: SqlClientBase[Any],
    *,
    binding_id: str,
    limit: int,
) -> Iterator[dict[str, Any]]:
    """Yield raw candidate message rows for ``binding_id`` as dictionaries.

    When the destination has no ``attachments`` table, the bootstrap query is
    used (after logging that fact); otherwise the reconciling query that joins
    against ``attachments`` is used.

    Args:
        sql_client: Destination SQL client the query is executed on.
        binding_id: Value bound to the ``_ctx_binding_id`` placeholders.
        limit: Maximum number of candidate rows to select.
    """
    if not destination_has_table(sql_client, "attachments"):
        LOGGER.info(
            "gmail.attachments.bootstrap binding_id=%s reason=attachments table does not exist yet; all candidates treated as new",
            binding_id,
        )
        bootstrap_sql = build_attachment_candidate_query_bootstrap(limit=limit)
        yield from _iter_query_rows(sql_client, bootstrap_sql, binding_id, limit)
        return

    # The reconciling query filters both tables by binding, hence the id twice.
    reconcile_sql = build_attachment_candidate_query(limit=limit)
    yield from _iter_query_rows(
        sql_client,
        reconcile_sql,
        binding_id,
        binding_id,
        limit,
    )
110
+
111
+
112
def _parse_attachment_candidate_row(
    *,
    binding_id: str,
    raw_candidate: Mapping[str, Any],
) -> AttachmentCandidateProjection | None:
    """Parse a raw candidate row, logging and returning ``None`` on validation failure.

    Args:
        binding_id: Binding id included in the error log line.
        raw_candidate: Row as produced by the candidate query.

    Returns:
        The parsed projection, or ``None`` when a pydantic ``ValidationError``
        was raised while parsing.
    """
    # Resolved up front so the error log has an id even for unparseable rows.
    logged_message_id = non_empty_string(raw_candidate.get("id")) or "-"
    try:
        projection = parse_attachment_candidate(raw_candidate)
    except ValidationError as validation_error:
        LOGGER.error(
            "gmail.attachments.materialization_failed binding_id=%s message_id=%s part_id=- attachment_id=- reason=Candidate message row failed validation: %s",
            binding_id,
            logged_message_id,
            format_validation_error(validation_error),
        )
        return None
    return projection
128
+
129
+
130
def _rebuild_attachment_rows(
    *,
    binding_id: str,
    candidate: AttachmentCandidateProjection,
    fetched_map: FetchedAttachmentMap,
) -> list[AttachmentRow] | None:
    """Build the attachment rows for one candidate message.

    Args:
        binding_id: Binding the rows belong to.
        candidate: Parsed candidate message with its attachment metadata.
        fetched_map: Fetched attachment payloads passed through to
            ``build_attachment_rows_for_message``.

    Returns:
        The rebuilt rows, or ``None`` (after logging an error) when the
        candidate claims attachments but carries no metadata, or when
        ``AttachmentMaterializationError`` is raised while building.
    """
    metadata_missing = (
        candidate.attachment_count > 0 and len(candidate.attachments) == 0
    )
    if metadata_missing:
        LOGGER.error(
            "gmail.attachments.materialization_failed binding_id=%s message_id=%s part_id=- attachment_id=- reason=Candidate message has attachment_count>0 with empty attachments metadata",
            binding_id,
            candidate.message_id,
        )
        return None

    try:
        rebuilt_rows = build_attachment_rows_for_message(
            binding_id=binding_id,
            message_id=candidate.message_id,
            attachments=candidate.attachments,
            fetched_map=fetched_map,
        )
    except AttachmentMaterializationError as failure:
        LOGGER.error(
            "gmail.attachments.materialization_failed binding_id=%s message_id=%s part_id=%s attachment_id=%s reason=%s",
            binding_id,
            candidate.message_id,
            failure.part_id or "-",
            failure.attachment_id or "-",
            failure.reason,
        )
        return None

    LOGGER.info(
        "gmail.attachments.message_rebuilt binding_id=%s message_id=%s expected_count=%d existing_count=%d rebuilt_count=%d",
        binding_id,
        candidate.message_id,
        candidate.attachment_count,
        candidate.existing_attachment_count,
        len(rebuilt_rows),
    )
    return rebuilt_rows
171
+
172
+
173
def _fetch_keys_for_candidate(
    candidate: AttachmentCandidateProjection,
) -> list[tuple[str, str]]:
    """Return ``(message_id, attachment_id)`` pairs for a candidate's attachments.

    Attachments whose ``attachment_id`` is ``None`` are left out; the order of
    ``candidate.attachments`` is preserved.
    """
    fetch_keys: list[tuple[str, str]] = []
    for attachment in candidate.attachments:
        attachment_id = attachment.attachment_id
        if attachment_id is None:
            continue
        fetch_keys.append((candidate.message_id, attachment_id))
    return fetch_keys
181
+
182
+
183
def _group_candidates_by_fetch_budget(
    candidates: Iterable[AttachmentCandidateProjection],
    *,
    max_fetches_per_group: int,
) -> Iterator[list[AttachmentCandidateProjection]]:
    """Yield consecutive groups of candidates bounded by a fetch budget.

    Candidates are appended to the current group in input order. The group is
    emitted, and a new one started, when adding the next candidate would push
    the group's fetch-key total above ``max_fetches_per_group``. A candidate
    is never split, so a single candidate needing more fetches than the budget
    still forms a group of its own.

    Raises:
        ValueError: If ``max_fetches_per_group`` is below 1 (raised when the
            generator is first advanced).
    """
    if max_fetches_per_group < 1:
        raise ValueError("max_fetches_per_group must be >= 1")

    pending: list[AttachmentCandidateProjection] = []
    pending_fetches = 0
    for candidate in candidates:
        needed = len(_fetch_keys_for_candidate(candidate))
        would_overflow = pending_fetches + needed > max_fetches_per_group
        if pending and would_overflow:
            yield pending
            # Start a fresh list; the one just yielded belongs to the consumer.
            pending, pending_fetches = [], 0
        pending.append(candidate)
        pending_fetches += needed

    if pending:
        yield pending
203
+
204
+
205
def _fetch_attachments_for_group(
    client: GmailApiClient,
    group: Sequence[AttachmentCandidateProjection],
) -> dict[tuple[str, str], GmailMessageAttachmentIngress]:
    """Fetch every attachment of a candidate group and index it by fetch key.

    All fetch keys of the group are collected in order and handed to
    ``client.iter_attachment_batches``. Each returned batch is paired
    positionally with the next ``len(batch)`` keys.

    NOTE(review): the pairing assumes batches come back in request order with
    one entry per requested key — confirm against the client implementation.
    """
    ordered_keys: list[tuple[str, str]] = [
        key
        for candidate in group
        for key in _fetch_keys_for_candidate(candidate)
    ]

    attachments_by_key: dict[tuple[str, str], GmailMessageAttachmentIngress] = {}
    start = 0
    for batch in client.iter_attachment_batches(ordered_keys):
        stop = start + len(batch)
        attachments_by_key.update(zip(ordered_keys[start:stop], batch))
        start = stop
    return attachments_by_key
220
+
221
+
222
@dlt.source(name=dlt_source_name(PLUGIN_ID, JOB))
def gmail_attachment_content_source(
    binding_id: str,
    *,
    client: GmailApiClient,
) -> tuple[Any, ...]:
    """Build the dlt source that materializes attachment rows for one binding.

    Args:
        binding_id: Binding whose candidate messages are processed.
        client: Gmail API client used to fetch attachment payloads.

    Returns:
        A one-element tuple holding the ``attachments`` resource.
    """

    @ctx_dlt_resource(
        name=dlt_resource_name("attachments"),
        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
        primary_key=("_ctx_binding_id", "message_id", "part_id"),
        merge_key=("_ctx_binding_id", "message_id"),
        columns={
            "content_id": {"data_type": "text"},
            "file_path": {"data_type": "text"},
        },
    )
    def attachments_resource() -> Iterator[AttachmentRow]:
        # Read the candidate rows into memory while the SQL client is open.
        with dlt.current.pipeline().sql_client() as sql_client:
            raw_candidates = list(
                iter_attachment_candidate_rows(
                    sql_client,
                    binding_id=binding_id,
                    limit=ATTACHMENT_CONTENT_CANDIDATE_LIMIT,
                )
            )

        # Parse every row; rows that fail validation come back as None and
        # are counted as failed messages.
        parsed = [
            _parse_attachment_candidate_row(
                binding_id=binding_id,
                raw_candidate=raw_candidate,
            )
            for raw_candidate in raw_candidates
        ]
        candidates: list[AttachmentCandidateProjection] = [
            candidate for candidate in parsed if candidate is not None
        ]
        candidate_count = len(parsed)
        failed_messages = candidate_count - len(candidates)
        successful_messages = 0
        rows_written = 0

        groups = _group_candidates_by_fetch_budget(
            candidates,
            max_fetches_per_group=MESSAGES_ATTACHMENTS_GET_BATCH_MAX_SUBREQUESTS,
        )
        for group in groups:
            fetched_map = _fetch_attachments_for_group(client, group)
            for candidate in group:
                message_rows = _rebuild_attachment_rows(
                    binding_id=binding_id,
                    candidate=candidate,
                    fetched_map=fetched_map,
                )
                if message_rows is None:
                    failed_messages += 1
                else:
                    successful_messages += 1
                    rows_written += len(message_rows)
                    yield from message_rows

        LOGGER.info(
            "gmail.attachments.run_complete binding_id=%s candidates=%d rebuilt_messages=%d failed_messages=%d rows_written=%d",
            binding_id,
            candidate_count,
            successful_messages,
            failed_messages,
            rows_written,
        )

    return (attachments_resource,)
294
+
295
+
296
def _iter_query_rows(
    sql_client: SqlClientBase[Any],
    query: str,
    *params: Any,
) -> Iterator[dict[str, Any]]:
    """Execute ``query`` and yield each result row as a column-name -> value dict.

    Args:
        sql_client: Client whose ``execute_query`` context manager is used.
        query: SQL text to execute.
        *params: Positional parameters forwarded to ``execute_query``.

    Yields:
        One dictionary per fetched row. Nothing is yielded when the cursor
        reports no ``description``.
    """
    with sql_client.execute_query(query, *params) as cursor:
        description = cursor.description
        if description is not None:
            # The first item of each description entry is the column name.
            column_names = [entry[0] for entry in description]
            for record in cursor.fetchall():
                yield dict(zip(column_names, record))
@@ -0,0 +1,129 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import time
5
+ from collections.abc import Iterator
6
+ from typing import Any
7
+
8
+ import dlt
9
+ from shared_plugins.naming import (
10
+ dlt_resource_name,
11
+ dlt_source_name,
12
+ plugin_id_from_module,
13
+ )
14
+ from shared_plugins.resources import ctx_dlt_resource
15
+
16
+ from ..models.ctx import MESSAGES_COLUMNS, LabelRow, MessageRow, ProfileRow
17
+ from ..models.translators import (
18
+ labels_to_ctx_models,
19
+ messages_to_ctx_models,
20
+ profiles_to_ctx_models,
21
+ )
22
+ from ..utils.client import MESSAGES_GET_BATCH_MAX_SUBREQUESTS, GmailApiClient
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ PLUGIN_ID = plugin_id_from_module(__file__)
27
+ JOB = "backfill"
28
+ BACKFILL_MESSAGES_MAX_RESULTS = 50_000
29
+
30
+
31
@dlt.source(name=dlt_source_name(PLUGIN_ID, JOB))
def gmail_backfill_source(
    binding_id: str,
    *,
    client: GmailApiClient,
) -> tuple[Any, ...]:
    """Build the bootstrap source for one binding: profile, labels and messages.

    Args:
        binding_id: Binding id passed to every row translator.
        client: Gmail API client used for all reads.

    Returns:
        The ``profile``, ``labels`` and ``messages`` resources, in that order.
    """

    @ctx_dlt_resource(
        name=dlt_resource_name("profile"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id",),
    )
    def profile_resource() -> Iterator[ProfileRow]:
        profile = client.get_profile()
        yield from profiles_to_ctx_models(binding_id, [profile])

    @ctx_dlt_resource(
        name=dlt_resource_name("labels"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id", "id"),
    )
    def labels_resource() -> Iterator[LabelRow]:
        labels = client.iter_labels()
        yield from labels_to_ctx_models(binding_id, labels)

    @ctx_dlt_resource(
        name=dlt_resource_name("messages"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id", "id"),
        columns=MESSAGES_COLUMNS,
    )
    def messages_resource() -> Iterator[MessageRow]:
        # Running totals for progress logs and for enforcing the overall cap.
        fetched_total = 0
        pages_seen = 0
        batches_seen = 0
        started_at = time.monotonic()
        logger.info(
            "messages_resource: starting message page enumeration (limit=%d)",
            BACKFILL_MESSAGES_MAX_RESULTS,
        )

        for page_ids in client.iter_message_id_pages():
            budget = BACKFILL_MESSAGES_MAX_RESULTS - fetched_total
            if budget <= 0:
                logger.info(
                    "messages_resource: reached message limit=%d, stopping",
                    BACKFILL_MESSAGES_MAX_RESULTS,
                )
                break

            # Trim the page so the cap is never exceeded by requested IDs.
            selected_ids = page_ids[:budget]
            pages_seen += 1
            logger.info(
                "messages_resource: page %d enumerated %d IDs, processing %d IDs (%.1fs elapsed)",
                pages_seen,
                len(page_ids),
                len(selected_ids),
                time.monotonic() - started_at,
            )

            fetched_batches = client.iter_message_batches(
                selected_ids,
                batch_size=MESSAGES_GET_BATCH_MAX_SUBREQUESTS,
            )
            for fetched_batch in fetched_batches:
                batches_seen += 1
                fetched_total += len(fetched_batch)
                yield from messages_to_ctx_models(binding_id, fetched_batch)

                # Progress line every tenth batch.
                if batches_seen % 10 == 0:
                    logger.info(
                        "messages_resource: fetched %d messages in %d batches across %d pages (%.1fs elapsed)",
                        fetched_total,
                        batches_seen,
                        pages_seen,
                        time.monotonic() - started_at,
                    )

            if fetched_total >= BACKFILL_MESSAGES_MAX_RESULTS:
                logger.info(
                    "messages_resource: reached message limit=%d after page %d",
                    BACKFILL_MESSAGES_MAX_RESULTS,
                    pages_seen,
                )
                break

        logger.info(
            "messages_resource: finished %d messages across %d pages in %d batches (%.1fs)",
            fetched_total,
            pages_seen,
            batches_seen,
            time.monotonic() - started_at,
        )

    return (
        profile_resource,
        labels_resource,
        messages_resource,
    )
@@ -0,0 +1,160 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterator
4
+ from typing import Any
5
+
6
+ import dlt
7
+ from dlt.destinations.sql_client import SqlClientBase
8
+ from shared_plugins.google_client.http_errors import extract_http_status_code
9
+ from shared_plugins.dlt import destination_has_table
10
+ from shared_plugins.naming import (
11
+ dlt_resource_name,
12
+ dlt_source_name,
13
+ plugin_id_from_module,
14
+ )
15
+ from shared_plugins.resources import ctx_dlt_resource, ctx_dlt_transformer
16
+ from shared_plugins.values import require_non_negative_int
17
+
18
+ from ..models.ctx import (
19
+ MESSAGES_COLUMNS,
20
+ HistoryEventRow,
21
+ LabelRow,
22
+ MessageRow,
23
+ ProfileRow,
24
+ )
25
+ from ..models.translators import (
26
+ extract_changed_ids,
27
+ history_events_to_ctx_models,
28
+ labels_to_ctx_models,
29
+ messages_to_ctx_models,
30
+ profiles_to_ctx_models,
31
+ utc_now_iso,
32
+ )
33
+ from ..utils.client import GmailApiClient
34
+
35
+ HISTORY_CURSOR_KEY = "history_cursor"
36
+ HISTORY_LAST_SYNCED_AT_KEY = "history_last_synced_at"
37
+ HISTORY_SEEDED_AT_KEY = "history_seeded_at"
38
+ PLUGIN_ID = plugin_id_from_module(__file__)
39
+ JOB = "history"
40
+
41
+
42
def read_max_messages_history_id(
    sql_client: SqlClientBase[Any],
    *,
    binding_id: str,
) -> int | None:
    """Return the largest ``history_id`` stored in ``messages`` for a binding.

    Args:
        sql_client: Destination SQL client to query.
        binding_id: Value bound to the ``_ctx_binding_id`` filter.

    Returns:
        The maximum passed through ``require_non_negative_int``, or ``None``
        when the ``messages`` table does not exist, the query returns no row,
        or the maximum itself is NULL.
    """
    if not destination_has_table(sql_client, "messages"):
        return None

    max_history_sql = """
        SELECT MAX(history_id)
        FROM messages
        WHERE _ctx_binding_id = %s
    """.strip()

    with sql_client.execute_query(max_history_sql, binding_id) as cursor:
        first_row = cursor.fetchone()

    if first_row is None:
        return None

    max_history_id = first_row[0]
    if max_history_id is None:
        return None
    return require_non_negative_int(max_history_id)
67
+
68
+
69
@dlt.source(name=dlt_source_name(PLUGIN_ID, JOB))
def gmail_history_source(
    binding_id: str,
    *,
    client: GmailApiClient,
) -> tuple[Any, ...]:
    """Build the incremental history-polling source for one binding.

    The history cursor is kept in dlt source state under
    ``HISTORY_CURSOR_KEY``.

    Args:
        binding_id: Binding id passed to every row translator.
        client: Gmail API client used for all reads.

    Returns:
        The profile, labels and history-event resources followed by the
        messages transformer fed from the history events.
    """

    # Shared across transformer invocations so each message is fetched once
    # per source instance.
    seen_message_ids: set[str] = set()

    @ctx_dlt_resource(
        name=dlt_resource_name("profile"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id",),
    )
    def profile_snapshot_resource() -> Iterator[ProfileRow]:
        profile = client.get_profile()
        yield from profiles_to_ctx_models(binding_id, [profile])

    @ctx_dlt_resource(
        name=dlt_resource_name("labels"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id", "id"),
    )
    def labels_snapshot_resource() -> Iterator[LabelRow]:
        labels = client.iter_labels()
        yield from labels_to_ctx_models(binding_id, labels)

    @ctx_dlt_resource(
        name=dlt_resource_name("history_events"),
        write_disposition="append",
        primary_key=("_ctx_binding_id", "id"),
    )
    def history_events_resource() -> Iterator[HistoryEventRow]:
        state = dlt.current.source_state()
        stored_cursor = state.get(HISTORY_CURSOR_KEY)

        if not stored_cursor:
            # No cursor yet: seed it from already-loaded messages and emit
            # nothing on this run.
            with dlt.current.pipeline().sql_client() as sql_client:
                seed = read_max_messages_history_id(
                    sql_client,
                    binding_id=binding_id,
                )

            if seed is not None:
                state[HISTORY_CURSOR_KEY] = str(seed)
                state[HISTORY_SEEDED_AT_KEY] = utc_now_iso()
            return

        first_cursor = str(require_non_negative_int(stored_cursor))
        newest_cursor = first_cursor
        for page in client.iter_history_pages(start_history_id=first_cursor):
            newest_cursor = page.history_id
            yield from history_events_to_ctx_models(binding_id, page.history)

        # Only reached once every page has been consumed.
        state[HISTORY_CURSOR_KEY] = newest_cursor
        state[HISTORY_LAST_SYNCED_AT_KEY] = utc_now_iso()

    @ctx_dlt_transformer(
        data_from=history_events_resource,
        name=dlt_resource_name("messages"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id", "id"),
        columns=MESSAGES_COLUMNS,
    )
    def messages_from_history(
        event_record: HistoryEventRow | dict[str, Any],
    ) -> Iterator[MessageRow]:
        if isinstance(event_record, HistoryEventRow):
            event_row = event_record
        else:
            event_row = HistoryEventRow.model_validate(event_record)

        changed_message_ids, _ = extract_changed_ids(event_row)
        for message_id in sorted(changed_message_ids):
            if message_id in seen_message_ids:
                continue
            seen_message_ids.add(message_id)

            try:
                message = client.get_message(message_id)
            except Exception as fetch_error:
                # A 404 is skipped silently; anything else propagates.
                if extract_http_status_code(fetch_error) != 404:
                    raise
            else:
                yield from messages_to_ctx_models(binding_id, [message])

    return (
        profile_snapshot_resource,
        labels_snapshot_resource,
        history_events_resource,
        messages_from_history,
    )
File without changes