contextbase-plugin-gmail 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextbase_plugin_gmail-0.2.6.dist-info/METADATA +13 -0
- contextbase_plugin_gmail-0.2.6.dist-info/RECORD +21 -0
- contextbase_plugin_gmail-0.2.6.dist-info/WHEEL +4 -0
- plugin_gmail/__init__.py +0 -0
- plugin_gmail/binding_config.py +13 -0
- plugin_gmail/component.py +269 -0
- plugin_gmail/defs/__init__.py +0 -0
- plugin_gmail/defs/defs.yaml +1 -0
- plugin_gmail/models/__init__.py +0 -0
- plugin_gmail/models/ctx.py +132 -0
- plugin_gmail/models/ingress.py +185 -0
- plugin_gmail/models/translators.py +470 -0
- plugin_gmail/models/types.py +12 -0
- plugin_gmail/plugin.json +9 -0
- plugin_gmail/sources/__init__.py +0 -0
- plugin_gmail/sources/attachments.py +307 -0
- plugin_gmail/sources/backfill.py +129 -0
- plugin_gmail/sources/history.py +160 -0
- plugin_gmail/utils/__init__.py +0 -0
- plugin_gmail/utils/attachments.py +251 -0
- plugin_gmail/utils/client.py +494 -0
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from collections.abc import Iterable, Iterator, Mapping, Sequence
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import dlt
|
|
8
|
+
from dlt.destinations.sql_client import SqlClientBase
|
|
9
|
+
from pydantic import ValidationError
|
|
10
|
+
from shared_plugins.dlt import destination_has_table
|
|
11
|
+
from shared_plugins.models import format_validation_error
|
|
12
|
+
from shared_plugins.naming import (
|
|
13
|
+
dlt_resource_name,
|
|
14
|
+
dlt_source_name,
|
|
15
|
+
plugin_id_from_module,
|
|
16
|
+
)
|
|
17
|
+
from shared_plugins.resources import ctx_dlt_resource
|
|
18
|
+
from shared_plugins.values import non_empty_string
|
|
19
|
+
|
|
20
|
+
from ..models.ctx import AttachmentCandidateProjection, AttachmentRow
|
|
21
|
+
from ..models.ingress import GmailMessageAttachmentIngress
|
|
22
|
+
from ..models.translators import attachment_candidate_rows_to_ctx_models
|
|
23
|
+
from ..utils.attachments import (
|
|
24
|
+
AttachmentMaterializationError,
|
|
25
|
+
FetchedAttachmentMap,
|
|
26
|
+
build_attachment_rows_for_message,
|
|
27
|
+
)
|
|
28
|
+
from ..utils.client import (
|
|
29
|
+
MESSAGES_ATTACHMENTS_GET_BATCH_MAX_SUBREQUESTS,
|
|
30
|
+
GmailApiClient,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Plugin identifier computed by plugin_id_from_module from this module's __file__.
PLUGIN_ID = plugin_id_from_module(__file__)
# Job name; combined with PLUGIN_ID to name the dlt source declared in this module.
JOB = "attachment_content"
# Row limit handed to the candidate query on every run of the attachments resource.
ATTACHMENT_CONTENT_CANDIDATE_LIMIT = 200
# Module-level logger used for all gmail.attachments.* log lines below.
LOGGER = logging.getLogger(__name__)

# Second name bound to the same class as AttachmentCandidateProjection.
AttachmentCandidate = AttachmentCandidateProjection
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def parse_attachment_candidate(row: Mapping[str, Any]) -> AttachmentCandidateProjection:
    """Translate one raw candidate row into its ctx projection model.

    The row is wrapped in a single-element list, passed to
    ``attachment_candidate_rows_to_ctx_models`` and the first model the
    translator produces is returned.
    """
    translated = attachment_candidate_rows_to_ctx_models([row])
    return next(translated)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _validate_limit(limit: int) -> None:
    """Reject any ``limit`` that is not a positive, non-boolean integer.

    Raises:
        ValueError: if ``limit`` is a ``bool``, is not an ``int``, or is
            smaller than 1.
    """
    # bool is a subclass of int, so it has to be excluded explicitly.
    is_plain_int = isinstance(limit, int) and not isinstance(limit, bool)
    if is_plain_int and limit >= 1:
        return
    raise ValueError("limit must be an integer >= 1")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def build_attachment_candidate_query(*, limit: int) -> str:
    """Return the SQL selecting messages whose attachment rows are out of sync.

    The statement reads ``messages`` left-joined to a per-message row count
    taken from ``attachments`` and keeps messages with ``attachment_count > 0``
    whose count differs from the number of existing attachment rows. Results
    are ordered by ``_ctx_source_updated_at`` descending (nulls last), then
    by ``id`` ascending.

    ``limit`` is only validated here; it is not interpolated into the text.
    The statement carries three ``%s`` placeholders, in order: the binding id
    for the ``attachments`` subquery, the binding id for ``messages``, and
    the row limit.

    Raises:
        ValueError: if ``limit`` is rejected by ``_validate_limit``.
    """
    _validate_limit(limit)

    return """
        SELECT
            m.id,
            m.attachment_count,
            m.attachments,
            COALESCE(a.attachment_row_count, 0) AS existing_attachment_count
        FROM messages AS m
        LEFT JOIN (
            SELECT
                message_id,
                COUNT(*)::bigint AS attachment_row_count
            FROM attachments
            WHERE _ctx_binding_id = %s
            GROUP BY message_id
        ) AS a
            ON a.message_id = m.id
        WHERE m._ctx_binding_id = %s
            AND m.attachment_count > 0
            AND m.attachment_count <> COALESCE(a.attachment_row_count, 0)
        ORDER BY m._ctx_source_updated_at DESC NULLS LAST, m.id ASC
        LIMIT %s
    """.strip()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def build_attachment_candidate_query_bootstrap(*, limit: int) -> str:
    """Return the candidate SQL that does not touch the ``attachments`` table.

    Every message of the binding with ``attachment_count > 0`` is selected and
    ``existing_attachment_count`` is reported as the constant 0. Ordering is
    by ``_ctx_source_updated_at`` descending (nulls last), then ``id``.

    ``limit`` is only validated here; it is not interpolated into the text.
    The statement carries two ``%s`` placeholders, in order: the binding id
    and the row limit.

    Raises:
        ValueError: if ``limit`` is rejected by ``_validate_limit``.
    """
    _validate_limit(limit)

    return """
        SELECT
            m.id,
            m.attachment_count,
            m.attachments,
            0::bigint AS existing_attachment_count
        FROM messages AS m
        WHERE m._ctx_binding_id = %s
            AND m.attachment_count > 0
        ORDER BY m._ctx_source_updated_at DESC NULLS LAST, m.id ASC
        LIMIT %s
    """.strip()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def iter_attachment_candidate_rows(
    sql_client: SqlClientBase[Any],
    *,
    binding_id: str,
    limit: int,
) -> Iterator[dict[str, Any]]:
    """Yield raw candidate message rows for ``binding_id`` as dicts.

    When the destination has no ``attachments`` table yet, the bootstrap
    query is used (after logging that fact); otherwise the reconciling query
    that compares against existing attachment rows is used.
    """
    if not destination_has_table(sql_client, "attachments"):
        LOGGER.info(
            "gmail.attachments.bootstrap binding_id=%s reason=attachments table does not exist yet; all candidates treated as new",
            binding_id,
        )
        bootstrap_sql = build_attachment_candidate_query_bootstrap(limit=limit)
        # Placeholders: messages binding id, LIMIT.
        yield from _iter_query_rows(sql_client, bootstrap_sql, binding_id, limit)
        return

    reconcile_sql = build_attachment_candidate_query(limit=limit)
    # Placeholders: attachments binding id, messages binding id, LIMIT.
    reconcile_params = (binding_id, binding_id, limit)
    yield from _iter_query_rows(sql_client, reconcile_sql, *reconcile_params)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _parse_attachment_candidate_row(
    *,
    binding_id: str,
    raw_candidate: Mapping[str, Any],
) -> AttachmentCandidateProjection | None:
    """Parse one raw candidate row, logging and returning ``None`` on failure.

    Only ``ValidationError`` is handled; any other exception propagates.
    """
    # Resolved before parsing so the error log can name the message; "-" is
    # logged when the row carries no usable id.
    raw_id = raw_candidate.get("id")
    log_message_id = non_empty_string(raw_id) or "-"

    try:
        projection = parse_attachment_candidate(raw_candidate)
    except ValidationError as validation_error:
        reason = format_validation_error(validation_error)
        LOGGER.error(
            "gmail.attachments.materialization_failed binding_id=%s message_id=%s part_id=- attachment_id=- reason=Candidate message row failed validation: %s",
            binding_id,
            log_message_id,
            reason,
        )
        return None
    return projection
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _rebuild_attachment_rows(
    *,
    binding_id: str,
    candidate: AttachmentCandidateProjection,
    fetched_map: FetchedAttachmentMap,
) -> list[AttachmentRow] | None:
    """Build the attachment rows for one candidate message.

    Returns the rows on success. Returns ``None`` after logging an error when
    the candidate reports attachments but has no attachment metadata, or
    when ``build_attachment_rows_for_message`` raises
    ``AttachmentMaterializationError``.
    """
    message_id = candidate.message_id
    expected_count = candidate.attachment_count

    # A positive count with no metadata cannot be materialized at all.
    if expected_count > 0 and len(candidate.attachments) == 0:
        LOGGER.error(
            "gmail.attachments.materialization_failed binding_id=%s message_id=%s part_id=- attachment_id=- reason=Candidate message has attachment_count>0 with empty attachments metadata",
            binding_id,
            message_id,
        )
        return None

    try:
        rebuilt_rows = build_attachment_rows_for_message(
            binding_id=binding_id,
            message_id=message_id,
            attachments=candidate.attachments,
            fetched_map=fetched_map,
        )
    except AttachmentMaterializationError as failure:
        LOGGER.error(
            "gmail.attachments.materialization_failed binding_id=%s message_id=%s part_id=%s attachment_id=%s reason=%s",
            binding_id,
            message_id,
            failure.part_id or "-",
            failure.attachment_id or "-",
            failure.reason,
        )
        return None

    LOGGER.info(
        "gmail.attachments.message_rebuilt binding_id=%s message_id=%s expected_count=%d existing_count=%d rebuilt_count=%d",
        binding_id,
        message_id,
        expected_count,
        candidate.existing_attachment_count,
        len(rebuilt_rows),
    )
    return rebuilt_rows
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _fetch_keys_for_candidate(
    candidate: AttachmentCandidateProjection,
) -> list[tuple[str, str]]:
    """List the ``(message_id, attachment_id)`` pairs of one candidate.

    Attachments whose ``attachment_id`` is ``None`` are left out; the order
    of ``candidate.attachments`` is preserved.
    """
    fetch_keys: list[tuple[str, str]] = []
    for attachment in candidate.attachments:
        attachment_id = attachment.attachment_id
        if attachment_id is None:
            continue
        fetch_keys.append((candidate.message_id, attachment_id))
    return fetch_keys
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _group_candidates_by_fetch_budget(
    candidates: Iterable[AttachmentCandidateProjection],
    *,
    max_fetches_per_group: int,
) -> Iterator[list[AttachmentCandidateProjection]]:
    """Yield consecutive candidate groups bounded by a fetch-key budget.

    Candidates are appended to the current group until adding the next one
    would push the group's total fetch-key count above
    ``max_fetches_per_group``; the group is then yielded and a new one is
    started. A candidate is never split, so a single candidate with more
    keys than the budget ends up in a group of its own.

    Raises:
        ValueError: if ``max_fetches_per_group`` is below 1 (raised when the
            generator is first advanced).
    """
    if max_fetches_per_group < 1:
        raise ValueError("max_fetches_per_group must be >= 1")

    pending: list[AttachmentCandidateProjection] = []
    pending_cost = 0
    for candidate in candidates:
        cost = len(_fetch_keys_for_candidate(candidate))
        would_overflow = pending_cost + cost > max_fetches_per_group
        if pending and would_overflow:
            yield pending
            # Rebind rather than clear: the consumer still holds the yielded list.
            pending, pending_cost = [], 0
        pending.append(candidate)
        pending_cost += cost

    if pending:
        yield pending
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _fetch_attachments_for_group(
    client: GmailApiClient,
    group: Sequence[AttachmentCandidateProjection],
) -> dict[tuple[str, str], GmailMessageAttachmentIngress]:
    """Fetch every attachment of a candidate group, keyed by fetch key.

    All fetch keys of the group are collected in candidate order and handed
    to ``client.iter_attachment_batches``. Results are matched to keys purely
    by position.
    """
    # NOTE(review): positional matching assumes the client returns one result
    # per key, in request order — confirm against GmailApiClient.
    keys = [
        key
        for candidate in group
        for key in _fetch_keys_for_candidate(candidate)
    ]

    attachments_by_key: dict[tuple[str, str], GmailMessageAttachmentIngress] = {}
    position = 0
    for batch in client.iter_attachment_batches(keys):
        batch_size = len(batch)
        window = keys[position : position + batch_size]
        for key, attachment in zip(window, batch):
            attachments_by_key[key] = attachment
        position += batch_size
    return attachments_by_key
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
@dlt.source(name=dlt_source_name(PLUGIN_ID, JOB))
def gmail_attachment_content_source(
    binding_id: str,
    *,
    client: GmailApiClient,
) -> tuple[Any, ...]:
    """dlt source that rebuilds attachment rows for one binding.

    Args:
        binding_id: Binding whose messages are inspected; passed to the
            candidate query and stamped into every log line.
        client: Gmail API client used to fetch attachment payloads in batches.

    Returns:
        A one-element tuple holding the ``attachments`` resource.
    """

    @ctx_dlt_resource(
        name=dlt_resource_name("attachments"),
        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
        primary_key=("_ctx_binding_id", "message_id", "part_id"),
        merge_key=("_ctx_binding_id", "message_id"),
        columns={
            "content_id": {"data_type": "text"},
            "file_path": {"data_type": "text"},
        },
    )
    def attachments_resource() -> Iterator[AttachmentRow]:
        # Counters feed only the run_complete summary log at the end.
        candidate_count = 0
        successful_messages = 0
        failed_messages = 0
        rows_written = 0

        # The candidate rows are materialized into a list while the SQL client
        # is open, so the query result is fully read inside this block.
        with dlt.current.pipeline().sql_client() as sql_client:
            raw_candidates = list(
                iter_attachment_candidate_rows(
                    sql_client,
                    binding_id=binding_id,
                    limit=ATTACHMENT_CONTENT_CANDIDATE_LIMIT,
                )
            )

        # Rows that fail validation are counted as failed and dropped; the
        # parsing helper has already logged the reason.
        candidates: list[AttachmentCandidateProjection] = []
        for raw_candidate in raw_candidates:
            candidate_count += 1
            candidate = _parse_attachment_candidate_row(
                binding_id=binding_id,
                raw_candidate=raw_candidate,
            )
            if candidate is None:
                failed_messages += 1
                continue
            candidates.append(candidate)

        # Fetch attachments one group at a time, with groups sized by the
        # attachment batch sub-request limit, then rebuild rows per message.
        for group in _group_candidates_by_fetch_budget(
            candidates,
            max_fetches_per_group=MESSAGES_ATTACHMENTS_GET_BATCH_MAX_SUBREQUESTS,
        ):
            fetched_map = _fetch_attachments_for_group(client, group)
            for candidate in group:
                message_rows = _rebuild_attachment_rows(
                    binding_id=binding_id,
                    candidate=candidate,
                    fetched_map=fetched_map,
                )
                # None means the message could not be rebuilt; it is skipped
                # so one bad message does not stop the rest of the run.
                if message_rows is None:
                    failed_messages += 1
                    continue

                successful_messages += 1
                rows_written += len(message_rows)
                yield from message_rows

        # Reached only when the generator is consumed to the end.
        LOGGER.info(
            "gmail.attachments.run_complete binding_id=%s candidates=%d rebuilt_messages=%d failed_messages=%d rows_written=%d",
            binding_id,
            candidate_count,
            successful_messages,
            failed_messages,
            rows_written,
        )

    return (attachments_resource,)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _iter_query_rows(
    sql_client: SqlClientBase[Any],
    query: str,
    *params: Any,
) -> Iterator[dict[str, Any]]:
    """Run ``query`` with ``params`` and yield each result row as a dict.

    Keys are the first element of each ``cursor.description`` entry. Nothing
    is yielded when the cursor reports no description.
    """
    with sql_client.execute_query(query, *params) as result:
        description = result.description
        if description is not None:
            column_names = tuple(entry[0] for entry in description)
            for values in result.fetchall():
                yield {name: value for name, value in zip(column_names, values)}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Iterator
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import dlt
|
|
9
|
+
from shared_plugins.naming import (
|
|
10
|
+
dlt_resource_name,
|
|
11
|
+
dlt_source_name,
|
|
12
|
+
plugin_id_from_module,
|
|
13
|
+
)
|
|
14
|
+
from shared_plugins.resources import ctx_dlt_resource
|
|
15
|
+
|
|
16
|
+
from ..models.ctx import MESSAGES_COLUMNS, LabelRow, MessageRow, ProfileRow
|
|
17
|
+
from ..models.translators import (
|
|
18
|
+
labels_to_ctx_models,
|
|
19
|
+
messages_to_ctx_models,
|
|
20
|
+
profiles_to_ctx_models,
|
|
21
|
+
)
|
|
22
|
+
from ..utils.client import MESSAGES_GET_BATCH_MAX_SUBREQUESTS, GmailApiClient
|
|
23
|
+
|
|
24
|
+
# Module-level logger used for the messages_resource progress lines.
logger = logging.getLogger(__name__)

# Plugin identifier computed by plugin_id_from_module from this module's __file__.
PLUGIN_ID = plugin_id_from_module(__file__)
# Job name; combined with PLUGIN_ID to name the dlt source declared in this module.
JOB = "backfill"
# Cap on how many messages one run of the backfill messages resource fetches.
BACKFILL_MESSAGES_MAX_RESULTS = 50_000
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dlt.source(name=dlt_source_name(PLUGIN_ID, JOB))
def gmail_backfill_source(
    binding_id: str,
    *,
    client: GmailApiClient,
) -> tuple[Any, ...]:
    """Gmail bootstrap over profile, labels, and messages for one binding.

    Args:
        binding_id: Binding the rows are attributed to; forwarded to every
            translator call.
        client: Gmail API client used for the profile, label and message reads.

    Returns:
        The ``profile``, ``labels`` and ``messages`` resources, in that order.
    """

    @ctx_dlt_resource(
        name=dlt_resource_name("profile"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id",),
    )
    def profile_resource() -> Iterator[ProfileRow]:
        # Single profile snapshot, wrapped in a list for the translator.
        yield from profiles_to_ctx_models(binding_id, [client.get_profile()])

    @ctx_dlt_resource(
        name=dlt_resource_name("labels"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id", "id"),
    )
    def labels_resource() -> Iterator[LabelRow]:
        yield from labels_to_ctx_models(binding_id, client.iter_labels())

    @ctx_dlt_resource(
        name=dlt_resource_name("messages"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id", "id"),
        columns=MESSAGES_COLUMNS,
    )
    def messages_resource() -> Iterator[MessageRow]:
        # count tracks messages returned by the batch fetches; page_count and
        # batch_count exist for progress logging only.
        count = 0
        page_count = 0
        batch_count = 0
        t0 = time.monotonic()
        logger.info(
            "messages_resource: starting message page enumeration (limit=%d)",
            BACKFILL_MESSAGES_MAX_RESULTS,
        )
        for message_ids in client.iter_message_id_pages():
            remaining = BACKFILL_MESSAGES_MAX_RESULTS - count
            if remaining <= 0:
                logger.info(
                    "messages_resource: reached message limit=%d, stopping",
                    BACKFILL_MESSAGES_MAX_RESULTS,
                )
                break

            # Trim the page so the run never requests more than the limit.
            message_ids_to_process = message_ids[:remaining]
            page_count += 1
            elapsed = time.monotonic() - t0
            logger.info(
                "messages_resource: page %d enumerated %d IDs, processing %d IDs (%.1fs elapsed)",
                page_count,
                len(message_ids),
                len(message_ids_to_process),
                elapsed,
            )

            for message_batch in client.iter_message_batches(
                message_ids_to_process,
                batch_size=MESSAGES_GET_BATCH_MAX_SUBREQUESTS,
            ):
                batch_count += 1
                count += len(message_batch)
                yield from messages_to_ctx_models(binding_id, message_batch)

                # Progress line every tenth batch.
                if batch_count % 10 == 0:
                    elapsed = time.monotonic() - t0
                    logger.info(
                        "messages_resource: fetched %d messages in %d batches across %d pages (%.1fs elapsed)",
                        count,
                        batch_count,
                        page_count,
                        elapsed,
                    )

            # Stop before asking the client for another page once the limit is hit.
            if count >= BACKFILL_MESSAGES_MAX_RESULTS:
                logger.info(
                    "messages_resource: reached message limit=%d after page %d",
                    BACKFILL_MESSAGES_MAX_RESULTS,
                    page_count,
                )
                break

        elapsed = time.monotonic() - t0
        logger.info(
            "messages_resource: finished %d messages across %d pages in %d batches (%.1fs)",
            count,
            page_count,
            batch_count,
            elapsed,
        )

    return (
        profile_resource,
        labels_resource,
        messages_resource,
    )
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import dlt
|
|
7
|
+
from dlt.destinations.sql_client import SqlClientBase
|
|
8
|
+
from shared_plugins.google_client.http_errors import extract_http_status_code
|
|
9
|
+
from shared_plugins.dlt import destination_has_table
|
|
10
|
+
from shared_plugins.naming import (
|
|
11
|
+
dlt_resource_name,
|
|
12
|
+
dlt_source_name,
|
|
13
|
+
plugin_id_from_module,
|
|
14
|
+
)
|
|
15
|
+
from shared_plugins.resources import ctx_dlt_resource, ctx_dlt_transformer
|
|
16
|
+
from shared_plugins.values import require_non_negative_int
|
|
17
|
+
|
|
18
|
+
from ..models.ctx import (
|
|
19
|
+
MESSAGES_COLUMNS,
|
|
20
|
+
HistoryEventRow,
|
|
21
|
+
LabelRow,
|
|
22
|
+
MessageRow,
|
|
23
|
+
ProfileRow,
|
|
24
|
+
)
|
|
25
|
+
from ..models.translators import (
|
|
26
|
+
extract_changed_ids,
|
|
27
|
+
history_events_to_ctx_models,
|
|
28
|
+
labels_to_ctx_models,
|
|
29
|
+
messages_to_ctx_models,
|
|
30
|
+
profiles_to_ctx_models,
|
|
31
|
+
utc_now_iso,
|
|
32
|
+
)
|
|
33
|
+
from ..utils.client import GmailApiClient
|
|
34
|
+
|
|
35
|
+
# Keys under which the history job keeps its entries in the dlt source state.
# Cursor value that the next run passes as start_history_id.
HISTORY_CURSOR_KEY = "history_cursor"
# Timestamp written after a polling run has walked all history pages.
HISTORY_LAST_SYNCED_AT_KEY = "history_last_synced_at"
# Timestamp written when the cursor is first seeded from the messages table.
HISTORY_SEEDED_AT_KEY = "history_seeded_at"
# Plugin identifier computed by plugin_id_from_module from this module's __file__.
PLUGIN_ID = plugin_id_from_module(__file__)
# Job name; combined with PLUGIN_ID to name the dlt source declared in this module.
JOB = "history"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def read_max_messages_history_id(
    sql_client: SqlClientBase[Any],
    *,
    binding_id: str,
) -> int | None:
    """Return the largest ``history_id`` stored in ``messages`` for a binding.

    Returns ``None`` when the destination has no ``messages`` table, when the
    query yields no row, or when the aggregate is NULL. Otherwise the value
    is passed through ``require_non_negative_int``.
    """
    if not destination_has_table(sql_client, "messages"):
        return None

    statement = """
        SELECT MAX(history_id)
        FROM messages
        WHERE _ctx_binding_id = %s
    """.strip()
    with sql_client.execute_query(statement, binding_id) as result:
        first_row = result.fetchone()
        newest_history_id = None if first_row is None else first_row[0]

    if newest_history_id is None:
        return None
    return require_non_negative_int(newest_history_id)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dlt.source(name=dlt_source_name(PLUGIN_ID, JOB))
def gmail_history_source(
    binding_id: str,
    *,
    client: GmailApiClient,
) -> tuple[Any, ...]:
    """Incremental Gmail history polling job with explicit cursor state.

    Args:
        binding_id: Binding the rows are attributed to; forwarded to every
            translator call and used when seeding the cursor.
        client: Gmail API client used for profile, label, history and
            message reads.

    Returns:
        The ``profile`` and ``labels`` snapshot resources, the
        ``history_events`` resource and the ``messages`` transformer fed by it.
    """

    # Shared by every messages_from_history call made for this source
    # instance, so each changed message id is fetched at most once.
    seen_message_ids: set[str] = set()

    @ctx_dlt_resource(
        name=dlt_resource_name("profile"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id",),
    )
    def profile_snapshot_resource() -> Iterator[ProfileRow]:
        # Single profile snapshot, wrapped in a list for the translator.
        yield from profiles_to_ctx_models(binding_id, [client.get_profile()])

    @ctx_dlt_resource(
        name=dlt_resource_name("labels"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id", "id"),
    )
    def labels_snapshot_resource() -> Iterator[LabelRow]:
        yield from labels_to_ctx_models(binding_id, client.iter_labels())

    @ctx_dlt_resource(
        name=dlt_resource_name("history_events"),
        write_disposition="append",
        primary_key=("_ctx_binding_id", "id"),
    )
    def history_events_resource() -> Iterator[HistoryEventRow]:
        source_state = dlt.current.source_state()
        cursor = source_state.get(HISTORY_CURSOR_KEY)

        # No cursor stored yet: seed it from the highest history_id already in
        # the messages table and emit nothing on this run.
        if not cursor:
            with dlt.current.pipeline().sql_client() as sql_client:
                seeded_cursor = read_max_messages_history_id(
                    sql_client,
                    binding_id=binding_id,
                )

            # Nothing to seed from; state stays untouched so a later run
            # tries again.
            if seeded_cursor is None:
                return

            source_state[HISTORY_CURSOR_KEY] = str(seeded_cursor)
            source_state[HISTORY_SEEDED_AT_KEY] = utc_now_iso()
            return

        start_cursor = str(require_non_negative_int(cursor))
        # Falls back to the starting cursor when no page is returned.
        latest_cursor = start_cursor
        for page in client.iter_history_pages(start_history_id=start_cursor):
            latest_cursor = page.history_id
            yield from history_events_to_ctx_models(binding_id, page.history)

        # State is written only after every page has been walked.
        source_state[HISTORY_CURSOR_KEY] = latest_cursor
        source_state[HISTORY_LAST_SYNCED_AT_KEY] = utc_now_iso()

    @ctx_dlt_transformer(
        data_from=history_events_resource,
        name=dlt_resource_name("messages"),
        write_disposition="merge",
        primary_key=("_ctx_binding_id", "id"),
        columns=MESSAGES_COLUMNS,
    )
    def messages_from_history(
        event_record: HistoryEventRow | dict[str, Any],
    ) -> Iterator[MessageRow]:
        # The upstream item may arrive as a model or as a plain dict;
        # normalize it to HistoryEventRow before use.
        event_row = (
            event_record
            if isinstance(event_record, HistoryEventRow)
            else HistoryEventRow.model_validate(event_record)
        )
        # Only the first element of the returned pair is used here.
        message_ids, _ = extract_changed_ids(event_row)
        # Sorted so messages are fetched in a deterministic order.
        for message_id in sorted(message_ids):
            if message_id in seen_message_ids:
                continue
            seen_message_ids.add(message_id)
            try:
                message = client.get_message(message_id)
            except Exception as exc:
                # HTTP 404 is skipped (presumably the message no longer
                # exists — confirm); every other error is re-raised.
                if extract_http_status_code(exc) == 404:
                    continue
                raise
            yield from messages_to_ctx_models(binding_id, [message])

    return (
        profile_snapshot_resource,
        labels_snapshot_resource,
        history_events_resource,
        messages_from_history,
    )
|
|
File without changes
|