contextbase-plugin-microsoft-mail 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextbase_plugin_microsoft_mail-0.2.6.dist-info/METADATA +14 -0
- contextbase_plugin_microsoft_mail-0.2.6.dist-info/RECORD +18 -0
- contextbase_plugin_microsoft_mail-0.2.6.dist-info/WHEEL +4 -0
- plugin_microsoft_mail/__init__.py +1 -0
- plugin_microsoft_mail/binding_config.py +14 -0
- plugin_microsoft_mail/component.py +189 -0
- plugin_microsoft_mail/defs/__init__.py +0 -0
- plugin_microsoft_mail/defs/defs.yaml +1 -0
- plugin_microsoft_mail/models/__init__.py +1 -0
- plugin_microsoft_mail/models/ctx.py +378 -0
- plugin_microsoft_mail/models/translators.py +193 -0
- plugin_microsoft_mail/plugin.json +7 -0
- plugin_microsoft_mail/sources/__init__.py +1 -0
- plugin_microsoft_mail/sources/attachments.py +407 -0
- plugin_microsoft_mail/sources/sync.py +375 -0
- plugin_microsoft_mail/utils/__init__.py +1 -0
- plugin_microsoft_mail/utils/attachments.py +107 -0
- plugin_microsoft_mail/utils/client.py +245 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable, Iterator, Mapping
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from ..utils.client import graph_object_to_payload
|
|
7
|
+
from .ctx import (
|
|
8
|
+
AttachmentContentRow,
|
|
9
|
+
MailFolderRow,
|
|
10
|
+
MessageRow,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _payload(item: object) -> dict[str, Any]:
    """Return the payload dictionary produced by `graph_object_to_payload` for *item*."""
    converted = graph_object_to_payload(item)
    return converted
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _list_value(payload: Mapping[str, Any], key: str) -> list[Any]:
|
|
19
|
+
value = payload.get(key)
|
|
20
|
+
if value is None:
|
|
21
|
+
return []
|
|
22
|
+
if not isinstance(value, list):
|
|
23
|
+
raise TypeError(
|
|
24
|
+
f"translator: field {key!r} must be a list, got {type(value).__name__}"
|
|
25
|
+
)
|
|
26
|
+
return list(value)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _dict_value(payload: Mapping[str, Any], key: str) -> dict[str, Any] | None:
|
|
30
|
+
value = payload.get(key)
|
|
31
|
+
if value is None:
|
|
32
|
+
return None
|
|
33
|
+
if not isinstance(value, dict):
|
|
34
|
+
raise TypeError(
|
|
35
|
+
f"translator: field {key!r} must be a dict, got {type(value).__name__}"
|
|
36
|
+
)
|
|
37
|
+
return dict(value)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def mail_folder_rows_to_ctx_models(
    binding_id: str,
    rows: Iterable[object],
) -> Iterator[MailFolderRow]:
    """Yield one MailFolderRow per Graph delta mail-folder row.

    A row is treated as removed when the `@removed` marker appears either at
    the top level of its payload or inside `additional_data`. Removed rows are
    emitted as tombstones carrying only the binding id, the folder `id` and
    `ctx_deleted=True` (the marker dlt's `hard_delete` column hint acts on at
    merge time). All other rows are translated with their full field set.
    """
    for graph_row in rows:
        data = _payload(graph_row)
        extras = _dict_value(data, "additional_data") or {}
        removed = "@removed" in extras or "@removed" in data

        if not removed:
            yield MailFolderRow(
                ctx_binding_id=binding_id,
                id=data.get("id"),
                odata_type=data.get("@odata.type"),
                additional_data=extras,
                child_folder_count=data.get("childFolderCount"),
                child_folders=_list_value(data, "childFolders"),
                display_name=data.get("displayName"),
                is_hidden=data.get("isHidden"),
                message_rules=_list_value(data, "messageRules"),
                messages=_list_value(data, "messages"),
                multi_value_extended_properties=_list_value(
                    data,
                    "multiValueExtendedProperties",
                ),
                parent_folder_id=data.get("parentFolderId"),
                single_value_extended_properties=_list_value(
                    data,
                    "singleValueExtendedProperties",
                ),
                total_item_count=data.get("totalItemCount"),
                unread_item_count=data.get("unreadItemCount"),
            )
        else:
            # Tombstone: only the identifying columns plus the delete flag.
            yield MailFolderRow(
                ctx_binding_id=binding_id,
                id=data.get("id"),
                ctx_deleted=True,
            )
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def message_rows_to_ctx_models(
    binding_id: str,
    rows: Iterable[object],
    *,
    folder_id: str | None = None,
) -> Iterator[MessageRow]:
    """Yield one MessageRow per Graph delta message row.

    `folder_id` identifies the folder whose delta produced *rows*. It is only
    consulted for removed rows (those carrying `@removed` at the top level or
    in `additional_data`): the tombstone's `parent_folder_id` is taken from it,
    because such rows do not carry the folder themselves.

    Raises:
        RuntimeError: a removed row is encountered while `folder_id` is None.
    """
    for graph_row in rows:
        data = _payload(graph_row)
        extras = _dict_value(data, "additional_data") or {}
        removed = "@removed" in extras or "@removed" in data

        if removed:
            if folder_id is None:
                raise RuntimeError(
                    "@removed message row received without folder_id context "
                    f"message_id={data.get('id')!r}"
                )
            # Tombstone: identifying columns, owning folder, delete flag.
            yield MessageRow(
                ctx_binding_id=binding_id,
                id=data.get("id"),
                parent_folder_id=folder_id,
                ctx_deleted=True,
            )
            continue

        field = data.get
        yield MessageRow(
            ctx_binding_id=binding_id,
            ctx_source_updated_at=field("lastModifiedDateTime"),
            id=field("id"),
            odata_type=field("@odata.type"),
            etag=field("@odata.etag"),
            additional_data=extras,
            attachments=_list_value(data, "attachments"),
            bcc_recipients=_list_value(data, "bccRecipients"),
            body=_dict_value(data, "body"),
            body_preview=field("bodyPreview"),
            categories=_list_value(data, "categories"),
            cc_recipients=_list_value(data, "ccRecipients"),
            change_key=field("changeKey"),
            conversation_id=field("conversationId"),
            conversation_index=field("conversationIndex"),
            created_date_time=field("createdDateTime"),
            extensions=_list_value(data, "extensions"),
            flag=_dict_value(data, "flag"),
            from_=_dict_value(data, "from"),
            has_attachments=field("hasAttachments"),
            importance=field("importance"),
            inference_classification=field("inferenceClassification"),
            internet_message_headers=_list_value(
                data,
                "internetMessageHeaders",
            ),
            internet_message_id=field("internetMessageId"),
            is_delivery_receipt_requested=field("isDeliveryReceiptRequested"),
            is_draft=field("isDraft"),
            is_read=field("isRead"),
            is_read_receipt_requested=field("isReadReceiptRequested"),
            last_modified_date_time=field("lastModifiedDateTime"),
            multi_value_extended_properties=_list_value(
                data,
                "multiValueExtendedProperties",
            ),
            parent_folder_id=field("parentFolderId"),
            received_date_time=field("receivedDateTime"),
            reply_to=_list_value(data, "replyTo"),
            sender=_dict_value(data, "sender"),
            sent_date_time=field("sentDateTime"),
            single_value_extended_properties=_list_value(
                data,
                "singleValueExtendedProperties",
            ),
            subject=field("subject"),
            to_recipients=_list_value(data, "toRecipients"),
            unique_body=_dict_value(data, "uniqueBody"),
            web_link=field("webLink"),
        )
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def attachment_content_row_from_graph_payload(
    *,
    binding_id: str,
    message_id: str,
    attachment_payload: Mapping[str, Any],
    file_path: str,
) -> AttachmentContentRow:
    """Assemble an AttachmentContentRow for one Graph attachment payload.

    Fields are read from *attachment_payload* by their Graph (camelCase) keys;
    missing keys become ``None``. `lastModifiedDateTime` feeds both
    `ctx_source_updated_at` and `last_modified_date_time`. *file_path* is the
    location of the locally-materialized file and is stored unchanged.
    """
    read = attachment_payload.get
    modified_at = read("lastModifiedDateTime")
    return AttachmentContentRow(
        ctx_binding_id=binding_id,
        ctx_source_updated_at=modified_at,
        message_id=message_id,
        attachment_id=read("id"),
        odata_type=read("@odata.type"),
        media_content_type=read("@odata.mediaContentType"),
        name=read("name"),
        content_type=read("contentType"),
        size=read("size"),
        is_inline=read("isInline"),
        content_id=read("contentId"),
        content_location=read("contentLocation"),
        last_modified_date_time=modified_at,
        file_path=file_path,
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""DLT sources for Microsoft Mail."""
|
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
import uuid
|
|
6
|
+
from collections.abc import Iterator, Mapping
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any, Literal, Self
|
|
9
|
+
|
|
10
|
+
import dlt
|
|
11
|
+
from dlt.destinations.sql_client import SqlClientBase
|
|
12
|
+
from pydantic import model_validator
|
|
13
|
+
from shared_plugins.dlt import destination_has_table
|
|
14
|
+
from shared_plugins.models import IdStr, StrictModel
|
|
15
|
+
from shared_plugins.naming import (
|
|
16
|
+
dlt_resource_name,
|
|
17
|
+
dlt_source_name,
|
|
18
|
+
plugin_id_from_module,
|
|
19
|
+
)
|
|
20
|
+
from shared_plugins.resources import ctx_dlt_resource
|
|
21
|
+
|
|
22
|
+
from ..models.ctx import (
|
|
23
|
+
ATTACHMENT_CONTENT_COLUMN_DESCRIPTIONS,
|
|
24
|
+
AttachmentContentRow,
|
|
25
|
+
)
|
|
26
|
+
from ..utils.attachments import (
|
|
27
|
+
FILE_ATTACHMENT_ODATA_TYPE,
|
|
28
|
+
KNOWN_ATTACHMENT_ODATA_TYPES,
|
|
29
|
+
materialize_attachment_payloads,
|
|
30
|
+
)
|
|
31
|
+
from ..utils.client import (
|
|
32
|
+
SyncGraphMailClient,
|
|
33
|
+
graph_object_to_payload,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
# Plugin identifier, computed by `plugin_id_from_module` from this module's path.
PLUGIN_ID = plugin_id_from_module(__file__)
# Job name; combined with PLUGIN_ID to form the dlt source name below.
JOB = "attachment_content"
SOURCE_NAME = dlt_source_name(PLUGIN_ID, JOB)
LOGGER = logging.getLogger(__name__)
# Maximum number of candidates requested from the destination per run
# (passed as `limit` to `iter_candidates`).
DEFAULT_CANDIDATE_LIMIT = 200
# Value passed as `prefer_header` when fetching a full attachment from Graph.
ATTACHMENT_PREFER_HEADER = 'IdType="ImmutableId"'
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass(frozen=True)
class Candidate:
    """Immutable description of one message the attachment job must act on.

    Instances are produced from candidate-query rows via
    `AttachmentCandidateProjection.to_candidate`.
    """

    # Work to perform: 'materialize' or 'orphan'.
    action: str
    # Id of the message the action applies to.
    message_id: str
    # Attachment payloads taken from the message row; only set for
    # 'materialize' candidates, None for 'orphan'.
    attachments: list[dict[str, Any]] | None = None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class AttachmentCandidateProjection(StrictModel):
    """Validated shape of one row returned by the candidate queries.

    Enforces that `attachments` is present exactly when `action` is
    "materialize" and absent when it is "orphan".
    """

    action: Literal["materialize", "orphan"]
    message_id: IdStr
    attachments: list[dict[str, Any]] | None = None

    @model_validator(mode="after")
    def _validate_payload_for_action(self) -> Self:
        """Reject rows whose `attachments` presence contradicts `action`."""
        has_payload = self.attachments is not None
        if self.action == "materialize" and not has_payload:
            raise ValueError(
                "materialize attachment candidate requires attachments payload"
            )
        if self.action == "orphan" and has_payload:
            raise ValueError(
                "orphan attachment candidate must not include attachments payload"
            )
        return self

    def to_candidate(self) -> Candidate:
        """Convert to a `Candidate`, shallow-copying each attachment dict."""
        copied: list[dict[str, Any]] | None = None
        if self.attachments is not None:
            copied = [dict(entry) for entry in self.attachments]
        return Candidate(
            action=self.action,
            message_id=self.message_id,
            attachments=copied,
        )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def parse_attachment_candidate(row: Mapping[str, Any]) -> Candidate:
    """Validate a candidate-query row and convert it into a `Candidate`.

    The row is copied into a plain dict before being validated by
    `AttachmentCandidateProjection`; validation errors propagate to the caller.
    """
    projection = AttachmentCandidateProjection.model_validate(dict(row))
    return projection.to_candidate()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def build_materialize_arm_query() -> str:
    """Return the SQL for the 'materialize' arm of the candidate query.

    For one binding (single `%s` placeholder) the query keeps the most
    recently modified row per message and selects those messages whose number
    of attachments that are neither reference nor item attachments differs
    from the number of `attachment_content` rows stored for that message.
    """
    sql = """
        WITH latest_message AS (
            SELECT DISTINCT ON (_ctx_binding_id, id)
                _ctx_binding_id,
                id,
                attachments
            FROM messages
            WHERE _ctx_binding_id = %s
            ORDER BY _ctx_binding_id, id, last_modified_date_time DESC NULLS LAST
        )
        SELECT
            'materialize'::text AS action,
            m.id AS message_id,
            m.attachments AS attachments
        FROM latest_message AS m
        WHERE (
            SELECT count(*)
            FROM jsonb_array_elements(COALESCE(m.attachments, '[]'::jsonb)) AS a
            WHERE COALESCE(a->>'@odata.type', '') NOT IN (
                '#microsoft.graph.referenceAttachment',
                '#microsoft.graph.itemAttachment'
            )
        ) <> (
            SELECT count(*)
            FROM attachment_content AS c
            WHERE c._ctx_binding_id = m._ctx_binding_id
              AND c.message_id = m.id
        )
    """
    return sql.strip()
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def build_orphan_arm_query() -> str:
    """Return the SQL for the 'orphan' arm of the candidate query.

    For one binding (single `%s` placeholder) it selects each distinct
    `message_id` in `attachment_content` that has no matching row in
    `messages`, with a NULL `attachments` column.
    """
    sql = """
        SELECT
            'orphan'::text AS action,
            c.message_id,
            NULL::jsonb AS attachments
        FROM attachment_content AS c
        LEFT JOIN messages AS m
            ON m._ctx_binding_id = c._ctx_binding_id
            AND m.id = c.message_id
        WHERE c._ctx_binding_id = %s
            AND m.id IS NULL
        GROUP BY c.message_id
    """
    return sql.strip()
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def build_bootstrap_materialize_arm_query() -> str:
    """Return the materialize-arm SQL used before `attachment_content` exists.

    This variant never references `attachment_content`, so it cannot fail on
    a missing relation. Instead of comparing against the stored row count, it
    selects messages having at least one attachment that is neither a
    reference nor an item attachment. It takes a single `%s` placeholder for
    the binding id.
    """
    sql = """
        WITH latest_message AS (
            SELECT DISTINCT ON (_ctx_binding_id, id)
                _ctx_binding_id,
                id,
                attachments
            FROM messages
            WHERE _ctx_binding_id = %s
            ORDER BY _ctx_binding_id, id, last_modified_date_time DESC NULLS LAST
        )
        SELECT
            'materialize'::text AS action,
            m.id AS message_id,
            m.attachments AS attachments
        FROM latest_message AS m
        WHERE (
            SELECT count(*)
            FROM jsonb_array_elements(m.attachments) AS a
            WHERE COALESCE(a->>'@odata.type', '') NOT IN (
                '#microsoft.graph.referenceAttachment',
                '#microsoft.graph.itemAttachment'
            )
        ) > 0
    """
    return sql.strip()
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def build_combined_candidate_query(*, limit: int) -> str:
    """Return the SQL that unions the materialize and orphan arms.

    The result is ordered by `message_id` and capped by a `LIMIT %s`
    placeholder. Placeholders appear in this order: binding id (materialize
    arm), binding id (orphan arm), limit.

    `limit` is accepted for interface compatibility but is not interpolated
    into the SQL text; the caller supplies the value as a query parameter.
    """
    materialize_arm = build_materialize_arm_query()
    orphan_arm = build_orphan_arm_query()
    combined = f"""
        WITH candidates AS (
            {materialize_arm}
            UNION ALL
            {orphan_arm}
        )
        SELECT action, message_id, attachments
        FROM candidates
        ORDER BY message_id ASC
        LIMIT %s
    """
    return combined.strip()
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def iter_candidates(
    sql_client: SqlClientBase[Any],
    *,
    binding_id: str,
    limit: int,
) -> list[Candidate]:
    """Query the destination for up to *limit* attachment candidates.

    When `attachment_content` does not exist yet, only the bootstrap
    materialize arm is run; otherwise the combined materialize + orphan query
    is used. Each result row is validated through `parse_attachment_candidate`.

    Raises:
        RuntimeError: the `messages` table does not exist in the destination.
    """
    if not destination_has_table(sql_client, "messages"):
        raise RuntimeError(
            "messages table does not exist yet; run microsoft-mail-dlt-sync first"
        )

    if destination_has_table(sql_client, "attachment_content"):
        query = build_combined_candidate_query(limit=limit)
        # One binding id per arm, then the LIMIT value.
        params = (binding_id, binding_id, limit)
    else:
        # Bootstrap: the orphan arm would have no rows and attachment_content
        # cannot be referenced yet, so run the bootstrap materialize arm only.
        query = (
            build_bootstrap_materialize_arm_query()
            + "\nORDER BY message_id ASC\nLIMIT %s"
        )
        params = (binding_id, limit)

    with sql_client.execute_query(query, *params) as cursor:
        if cursor.description is None:
            return []
        column_names = [column[0] for column in cursor.description]
        return [
            parse_attachment_candidate(dict(zip(column_names, record)))
            for record in cursor.fetchall()
        ]
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _materializable_attachments(
    attachments: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Return the file attachments among *attachments*, in input order.

    Entries whose `@odata.type` is known but is not the file-attachment type
    are dropped. An `@odata.type` outside `KNOWN_ATTACHMENT_ODATA_TYPES`
    (including a missing one) raises ``RuntimeError`` rather than being
    skipped silently.
    """
    file_attachments: list[dict[str, Any]] = []
    for entry in attachments:
        kind = entry.get("@odata.type")
        if kind not in KNOWN_ATTACHMENT_ODATA_TYPES:
            raise RuntimeError(
                f"unknown attachment @odata.type {kind!r}; "
                f"expected one of {sorted(KNOWN_ATTACHMENT_ODATA_TYPES)}"
            )
        if kind != FILE_ATTACHMENT_ODATA_TYPE:
            continue
        file_attachments.append(entry)
    return file_attachments
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _full_attachment_payload(obj: Any) -> dict[str, Any]:
    """Build the materialization payload for a Graph FileAttachment object.

    The payload comes from `graph_object_to_payload`. When the object exposes
    `content_bytes` as bytes/bytearray, `contentBytes` is overwritten with
    those bytes decoded as ASCII.

    Rationale (per the original author's note, verified empirically in
    scratch/smoke_real_graph.py): Kiota's deserializer keeps `content_bytes`
    as the ASCII bytes of the base64 text from the wire, and its JSON writer
    base64-encodes them a second time. Restoring the original base64 string
    lets the single `b64decode` in `materialize_attachment_payloads` yield the
    real file bytes.
    """
    result = graph_object_to_payload(obj)
    wire_base64 = getattr(obj, "content_bytes", None)
    if not isinstance(wire_base64, (bytes, bytearray)):
        return result
    result["contentBytes"] = bytes(wire_base64).decode("ascii")
    return result
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def fetch_and_emit_for_message(
    *,
    binding_id: str,
    client: SyncGraphMailClient,
    candidate: Candidate,
) -> Iterator[AttachmentContentRow]:
    """Fetch every file attachment of *candidate* and yield its content rows.

    Each materializable attachment listed on the candidate is fetched in full
    through `client.get_attachment_full`. The collected payloads are then
    handed to `materialize_attachment_payloads`, whose rows are yielded.

    Raises:
        RuntimeError: the candidate has no attachments payload, or a listed
            attachment has no string `id`.
    """
    if candidate.attachments is None:
        raise RuntimeError(
            f"materialize candidate without attachments payload message_id={candidate.message_id}"
        )

    fetched: list[dict[str, Any]] = []
    for stub in _materializable_attachments(candidate.attachments):
        stub_id = stub.get("id")
        if not isinstance(stub_id, str):
            raise RuntimeError(
                "attachment in messages.attachments has no id "
                f"message_id={candidate.message_id} payload_keys={sorted(stub)}"
            )
        graph_attachment = client.get_attachment_full(
            message_id=candidate.message_id,
            attachment_id=stub_id,
            prefer_header=ATTACHMENT_PREFER_HEADER,
        )
        fetched.append(_full_attachment_payload(graph_attachment))

    yield from materialize_attachment_payloads(
        binding_id=binding_id,
        message_id=candidate.message_id,
        attachment_payloads=fetched,
    )
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
TOMBSTONE_ATTACHMENT_ID_PREFIX = "_ctx_tombstone:"
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _generate_tombstone_attachment_id() -> str:
|
|
290
|
+
"""Build a unique sentinel `attachment_id` for a tombstone row.
|
|
291
|
+
|
|
292
|
+
The prefix marks intent; the random uuid4 suffix prevents any conceivable
|
|
293
|
+
collision with a real Graph attachment id and guarantees uniqueness
|
|
294
|
+
across multiple tombstones in the same merge.
|
|
295
|
+
"""
|
|
296
|
+
return f"{TOMBSTONE_ATTACHMENT_ID_PREFIX}{uuid.uuid4()}"
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def emit_orphan_tombstone(
    *,
    binding_id: str,
    candidate: Candidate,
) -> AttachmentContentRow:
    """Build the tombstone row for all attachment content of one message.

    The row carries `ctx_deleted=True` so that dlt removes the
    `attachment_content` rows sharing its merge key
    (`_ctx_binding_id`, `message_id`). Because `attachment_id` is part of the
    primary key and must therefore be non-null, the tombstone gets a sentinel
    id of the form `_ctx_tombstone:<uuid4>`; the hard delete on `_ctx_deleted`
    drops the sentinel row at the end of the merge, so it is not expected to
    persist in the destination.
    """
    sentinel_id = _generate_tombstone_attachment_id()
    tombstone = AttachmentContentRow(
        ctx_binding_id=binding_id,
        message_id=candidate.message_id,
        attachment_id=sentinel_id,
        ctx_deleted=True,
    )
    return tombstone
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
@dlt.source(name=SOURCE_NAME)
def microsoft_mail_attachment_source(
    binding_id: str,
    *,
    client: SyncGraphMailClient,
) -> tuple[Any, ...]:
    """dlt source exposing the `attachment_content` resource for one binding.

    The resource asks the destination for candidates (at most
    `DEFAULT_CANDIDATE_LIMIT` per run). It emits a tombstone for every message
    with nothing to materialize, and otherwise the rows produced by fetching
    the message's file attachments through *client*.
    """

    @ctx_dlt_resource(
        name=dlt_resource_name("attachment_content"),
        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
        primary_key=("_ctx_binding_id", "message_id", "attachment_id"),
        merge_key=("_ctx_binding_id", "message_id"),
        columns={
            **ATTACHMENT_CONTENT_COLUMN_DESCRIPTIONS,
            # file_path is required on live rows but NULL on tombstones
            # (ctx_deleted=True). dlt infers schema from first-observed data,
            # so it is declared nullable upfront to match the pydantic
            # Optional type. attachment_id stays NOT NULL because it is part
            # of the PK; tombstones carry a sentinel value built from
            # TOMBSTONE_ATTACHMENT_ID_PREFIX.
            "file_path": {"nullable": True},
            "_ctx_deleted": {"data_type": "bool", "hard_delete": True},
        },
    )
    def attachment_content() -> Iterator[AttachmentContentRow]:
        """Yield live attachment rows and tombstones for this run's candidates."""
        # `schema_name=SOURCE_NAME` avoids dlt's `Schema(pipeline_name)`
        # fallback inside `_get_schema_or_create`. The pipeline_name for a
        # per-binding attachment_content pipeline routinely exceeds dlt's
        # 64-char Schema-name limit, and the source schema is the one wanted
        # here. `run_dlt_pipeline` pre-registers an empty schema with this
        # name on cold start so the lookup succeeds.
        pipeline = dlt.current.pipeline()
        with pipeline.sql_client(
            schema_name=SOURCE_NAME,
        ) as sql_client:
            candidates = iter_candidates(
                sql_client,
                binding_id=binding_id,
                limit=DEFAULT_CANDIDATE_LIMIT,
            )

        run_started = time.monotonic()
        materialized_messages = 0
        materialized_rows = 0
        orphaned_messages = 0

        for candidate in candidates:
            if candidate.action not in ("orphan", "materialize"):
                raise RuntimeError(f"unknown candidate action {candidate.action!r}")

            # A tombstone is needed for orphans, and also when the materialize
            # arm fires for a message with zero materializable attachments
            # (e.g. all file attachments were removed but reference
            # attachments remain, or the attachments array is empty). dlt's
            # delete-insert merge is a no-op when no rows are emitted for a
            # merge_key, so leftover rows would persist without the tombstone.
            needs_tombstone = candidate.action == "orphan" or not (
                _materializable_attachments(candidate.attachments or [])
            )
            if needs_tombstone:
                orphaned_messages += 1
                yield emit_orphan_tombstone(binding_id=binding_id, candidate=candidate)
                continue

            # Collect the rows first so the counters reflect the complete
            # message before anything is handed to dlt.
            message_rows = list(
                fetch_and_emit_for_message(
                    binding_id=binding_id,
                    client=client,
                    candidate=candidate,
                )
            )
            materialized_messages += 1
            materialized_rows += len(message_rows)
            yield from message_rows

        elapsed = time.monotonic() - run_started
        LOGGER.info(
            "microsoft_mail.attachment_content.run_complete "
            "candidates=%d materialized_messages=%d rows=%d orphaned_messages=%d elapsed=%.3fs",
            len(candidates),
            materialized_messages,
            materialized_rows,
            orphaned_messages,
            elapsed,
        )

    return (attachment_content,)
|