PyPI - contextbase-plugin-gmail - Versions diffs - 0.2.6__py3-none-any.whl - Mend

contextbase-plugin-gmail 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

contextbase_plugin_gmail-0.2.6.dist-info/METADATA +13 -0
contextbase_plugin_gmail-0.2.6.dist-info/RECORD +21 -0
contextbase_plugin_gmail-0.2.6.dist-info/WHEEL +4 -0
plugin_gmail/__init__.py +0 -0
plugin_gmail/binding_config.py +13 -0
plugin_gmail/component.py +269 -0
plugin_gmail/defs/__init__.py +0 -0
plugin_gmail/defs/defs.yaml +1 -0
plugin_gmail/models/__init__.py +0 -0
plugin_gmail/models/ctx.py +132 -0
plugin_gmail/models/ingress.py +185 -0
plugin_gmail/models/translators.py +470 -0
plugin_gmail/models/types.py +12 -0
plugin_gmail/plugin.json +9 -0
plugin_gmail/sources/__init__.py +0 -0
plugin_gmail/sources/attachments.py +307 -0
plugin_gmail/sources/backfill.py +129 -0
plugin_gmail/sources/history.py +160 -0
plugin_gmail/utils/__init__.py +0 -0
plugin_gmail/utils/attachments.py +251 -0
plugin_gmail/utils/client.py +494 -0

plugin_gmail/models/translators.py ADDED Viewed

@@ -0,0 +1,470 @@
+from __future__ import annotations
+import base64
+import binascii
+import codecs
+import re
+from collections.abc import Iterable, Iterator, Mapping
+from datetime import datetime, timezone
+from email.utils import parsedate_to_datetime
+from typing import Any
+from urllib.parse import unquote
+from shared_plugins.values import non_empty_string
+from .ctx import (
+    AttachmentCandidateProjection,
+    HistoryEventRow,
+    LabelRow,
+    MessageRow,
+    ProfileRow,
+)
+from .ingress import (
+    GmailAttachmentIngress,
+    GmailHistoryRecordIngress,
+    GmailLabelIngress,
+    GmailMessageIngress,
+    GmailMessagePartHeaderIngress,
+    GmailMessagePartIngress,
+    GmailProfileIngress,
+)
+_SPECIAL_ATTACHMENT_MIME_TYPES = {
+    "message/delivery-status",
+    "message/rfc822",
+}
+_SUPPORTED_BODY_MIME_TYPES = {
+    "text/plain",
+    "text/html",
+}
+class Base64UrlDecodeError(ValueError):
+    """Raised when a non-empty base64url payload cannot be decoded."""
+def utc_now_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+def profiles_to_ctx_models(
+    binding_id: str,
+    profiles: Iterable[GmailProfileIngress],
+) -> Iterator[ProfileRow]:
+    for profile in profiles:
+        yield ProfileRow(
+            ctx_binding_id=binding_id,
+            email_address=profile.email_address,
+            messages_total=profile.messages_total,
+            threads_total=profile.threads_total,
+            history_id=profile.history_id,
+        )
+def labels_to_ctx_models(
+    binding_id: str,
+    labels: Iterable[GmailLabelIngress],
+) -> Iterator[LabelRow]:
+    for label in labels:
+        yield LabelRow(
+            ctx_binding_id=binding_id,
+            id=label.id,
+            name=label.name,
+            type=label.type,
+            message_list_visibility=label.message_list_visibility,
+            label_list_visibility=label.label_list_visibility,
+            messages_total=label.messages_total,
+            messages_unread=label.messages_unread,
+            threads_total=label.threads_total,
+            threads_unread=label.threads_unread,
+            color=label.color.model_dump(by_alias=True) if label.color else None,
+        )
+def history_events_to_ctx_models(
+    binding_id: str,
+    history_events: Iterable[GmailHistoryRecordIngress],
+) -> Iterator[HistoryEventRow]:
+    for event in history_events:
+        raw_event = event.model_dump(by_alias=True)
+        yield HistoryEventRow(
+            ctx_binding_id=binding_id,
+            id=event.id,
+            messages=raw_event.get("messages", []),
+            messages_added=raw_event.get("messagesAdded", []),
+            messages_deleted=raw_event.get("messagesDeleted", []),
+            labels_added=raw_event.get("labelsAdded", []),
+            labels_removed=raw_event.get("labelsRemoved", []),
+        )
+def messages_to_ctx_models(
+    binding_id: str,
+    messages: Iterable[GmailMessageIngress],
+) -> Iterator[MessageRow]:
+    for message in messages:
+        payload = message.payload
+        headers = payload.headers if payload else []
+        body_text, body_html = _extract_bodies(payload, message_id=message.id)
+        attachments = _extract_attachments(payload)
+        yield MessageRow(
+            ctx_binding_id=binding_id,
+            ctx_source_updated_at=_internal_date_to_datetime(message.internal_date),
+            id=message.id,
+            thread_id=message.thread_id,
+            label_ids=list(message.label_ids),
+            snippet=message.snippet,
+            history_id=message.history_id,
+            internal_date=message.internal_date,
+            size_estimate=message.size_estimate,
+            subject=_extract_header(headers, "Subject"),
+            from_address=_extract_header(headers, "From"),
+            to_addresses=_extract_header(headers, "To"),
+            cc_addresses=_extract_header(headers, "Cc"),
+            bcc_addresses=_extract_header(headers, "Bcc"),
+            reply_to=_extract_header(headers, "Reply-To"),
+            message_id_header=_extract_header(headers, "Message-Id"),
+            in_reply_to=_extract_header(headers, "In-Reply-To"),
+            references_header=_extract_header(headers, "References"),
+            date=_parse_date_header(_extract_header(headers, "Date")),
+            body_text=body_text,
+            body_html=body_html,
+            mime_type=payload.mime_type if payload else None,
+            classification_label_values=list(message.classification_label_values),
+            attachment_count=len(attachments),
+            attachments=[item.model_dump(exclude_none=True) for item in attachments],
+        )
+def attachment_candidate_rows_to_ctx_models(
+    rows: Iterable[Mapping[str, Any]],
+) -> Iterator[AttachmentCandidateProjection]:
+    for row in rows:
+        yield AttachmentCandidateProjection.model_validate(dict(row))
+def _extract_header(
+    headers: list[GmailMessagePartHeaderIngress],
+    name: str,
+) -> str | None:
+    target_name = name.casefold()
+    for header in headers:
+        if header.name.casefold() == target_name:
+            return header.value
+    return None
+def decode_base64url_bytes(value: str | None) -> bytes | None:
+    if value is None or not isinstance(value, str):
+        return None
+    encoded = value.strip()
+    if encoded == "":
+        return b""
+    padded = encoded + "=" * ((4 - len(encoded) % 4) % 4)
+    try:
+        return base64.b64decode(padded, altchars=b"-_", validate=True)
+    except (binascii.Error, ValueError) as exc:
+        raise Base64UrlDecodeError("Invalid base64url payload.") from exc
+def _resolve_mime_charset(charset: str | None) -> str:
+    normalized = non_empty_string(charset)
+    if normalized is None:
+        return "utf-8"
+    try:
+        return codecs.lookup(normalized).name
+    except LookupError as exc:
+        raise ValueError(f"Unsupported MIME charset: {normalized}") from exc
+def _decode_base64url_text(
+    value: str | None,
+    *,
+    charset: str | None = None,
+) -> str | None:
+    decoded = decode_base64url_bytes(value)
+    if decoded is None or decoded == b"":
+        return None
+    return decoded.decode(_resolve_mime_charset(charset), errors="replace")
+def _walk_mime_parts(
+    part: GmailMessagePartIngress,
+    *,
+    mime_path: str,
+) -> list[tuple[GmailMessagePartIngress, str]]:
+    walked: list[tuple[GmailMessagePartIngress, str]] = [(part, mime_path)]
+    for index, child in enumerate(part.parts):
+        child_path = f"{mime_path}.{index}"
+        walked.extend(_walk_mime_parts(child, mime_path=child_path))
+    return walked
+def _extract_bodies(
+    part: GmailMessagePartIngress | None,
+    *,
+    message_id: str | None = None,
+) -> tuple[str | None, str | None]:
+    if part is None:
+        return None, None
+    text_chunks: list[str] = []
+    html_chunks: list[str] = []
+    for mime_part, mime_path in _walk_mime_parts(part, mime_path="0"):
+        mime_type = non_empty_string(mime_part.mime_type)
+        if mime_type not in _SUPPORTED_BODY_MIME_TYPES:
+            continue
+        body_data = mime_part.body.data if mime_part.body else None
+        charset = _extract_mime_part_charset(mime_part.headers)
+        try:
+            decoded = _decode_base64url_text(body_data, charset=charset)
+        except (Base64UrlDecodeError, ValueError) as exc:
+            raise RuntimeError(
+                "Failed to decode Gmail MIME body "
+                f"(message_id={message_id or '-'}, mime_path={mime_path}, "
+                f"mime_type={mime_type}, charset={charset or 'utf-8'}): {exc}"
+            ) from exc
+        if decoded is None:
+            continue
+        if mime_type == "text/plain":
+            text_chunks.append(decoded)
+        elif mime_type == "text/html":
+            html_chunks.append(decoded)
+    body_text = "\n".join(text_chunks).strip() if text_chunks else None
+    body_html = "\n".join(html_chunks).strip() if html_chunks else None
+    return body_text or None, body_html or None
+def _normalize_attachment_filename(value: str | None) -> str | None:
+    normalized = non_empty_string(value)
+    if normalized in {None, ".", ".."}:
+        return None
+    if normalized is not None and (
+        "/" in normalized or "\\" in normalized or "\x00" in normalized
+    ):
+        return None
+    return normalized
+def _decode_header_value(value: str) -> str:
+    cleaned = value.strip()
+    if (
+        (cleaned.startswith('"') and cleaned.endswith('"'))
+        or (cleaned.startswith("'") and cleaned.endswith("'"))
+    ) and len(cleaned) >= 2:
+        cleaned = cleaned[1:-1]
+    return cleaned
+def _decode_extended_header_value(value: str) -> str:
+    cleaned = _decode_header_value(value)
+    parts = cleaned.split("''", maxsplit=1)
+    encoded = parts[1] if len(parts) == 2 else cleaned
+    return unquote(encoded)
+def _extract_header_param(value: str, parameter: str) -> str | None:
+    star_match = re.search(rf"{re.escape(parameter)}\\*=([^;]+)", value, re.IGNORECASE)
+    if star_match:
+        return _decode_extended_header_value(star_match.group(1))
+    match = re.search(rf"{re.escape(parameter)}=([^;]+)", value, re.IGNORECASE)
+    if match:
+        return _decode_header_value(match.group(1))
+    return None
+def _extract_mime_part_charset(
+    headers: list[GmailMessagePartHeaderIngress],
+) -> str | None:
+    content_type = _extract_header(headers, "Content-Type")
+    if content_type is None:
+        return None
+    return non_empty_string(_extract_header_param(content_type, "charset"))
+def _extract_filename_from_part_headers(headers: Mapping[str, str]) -> str | None:
+    content_disposition = headers.get("content-disposition")
+    if content_disposition:
+        filename = _extract_header_param(content_disposition, "filename")
+        if filename:
+            return filename
+    content_type = headers.get("content-type")
+    if content_type:
+        filename = _extract_header_param(content_type, "name")
+        if filename:
+            return filename
+    content_description = headers.get("content-description")
+    if content_description:
+        return content_description
+    return None
+def _extract_part_headers(
+    headers: list[GmailMessagePartHeaderIngress],
+) -> dict[str, str]:
+    return {header.name.lower(): header.value for header in headers}
+def _normalize_content_id(value: str | None) -> str | None:
+    content_id = non_empty_string(value)
+    if content_id is None:
+        return None
+    return content_id.removeprefix("<").removesuffix(">") or None
+def _normalize_content_disposition(value: str | None) -> str | None:
+    disposition = non_empty_string(value)
+    if disposition is None:
+        return None
+    head = disposition.split(";", maxsplit=1)[0].strip()
+    return head or None
+def canonicalize_part_id(
+    *,
+    source_part_id: str | None,
+    attachment_id: str | None,
+    mime_path: str,
+) -> str:
+    part_id = non_empty_string(source_part_id)
+    if part_id is not None:
+        return part_id
+    if attachment_id is not None:
+        return f"aid:{attachment_id}"
+    return f"path:{mime_path}"
+def _extract_attachments(
+    part: GmailMessagePartIngress | None,
+) -> list[GmailAttachmentIngress]:
+    if part is None:
+        return []
+    attachments: list[GmailAttachmentIngress] = []
+    for mime_part, mime_path in _walk_mime_parts(part, mime_path="0"):
+        body = mime_part.body
+        attachment_id = non_empty_string(body.attachment_id if body else None)
+        inline_data = non_empty_string(body.data if body else None)
+        part_headers = _extract_part_headers(mime_part.headers)
+        filename = _normalize_attachment_filename(mime_part.filename)
+        if filename is None:
+            filename = _normalize_attachment_filename(
+                _extract_filename_from_part_headers(part_headers)
+            )
+        is_candidate = (
+            filename is not None
+            or attachment_id is not None
+            or mime_part.mime_type in _SPECIAL_ATTACHMENT_MIME_TYPES
+        )
+        if not is_candidate:
+            continue
+        if inline_data is None and attachment_id is None:
+            continue
+        part_id = canonicalize_part_id(
+            source_part_id=mime_part.part_id,
+            attachment_id=attachment_id,
+            mime_path=mime_path,
+        )
+        attachments.append(
+            GmailAttachmentIngress.model_validate(
+                {
+                    "attachment_id": attachment_id,
+                    "inline_data_b64url": (
+                        inline_data if attachment_id is None else None
+                    ),
+                    "part_id": part_id,
+                    "filename": filename,
+                    "mime_type": mime_part.mime_type,
+                    "size": body.size if body else None,
+                    "content_disposition": _normalize_content_disposition(
+                        part_headers.get("content-disposition")
+                    ),
+                    "content_id": _normalize_content_id(part_headers.get("content-id")),
+                }
+            )
+        )
+    return attachments
+def _parse_date_header(value: str | None) -> datetime | None:
+    header_value = non_empty_string(value)
+    if header_value is None:
+        return None
+    try:
+        parsed = parsedate_to_datetime(header_value)
+    except (TypeError, ValueError):
+        return None
+    if parsed is None:
+        return None
+    if parsed.tzinfo is None:
+        return parsed.replace(tzinfo=timezone.utc)
+    return parsed
+def _internal_date_to_datetime(value: int | None) -> datetime | None:
+    if value is None:
+        return None
+    try:
+        return datetime.fromtimestamp(value / 1000.0, tz=timezone.utc)
+    except (OverflowError, OSError, ValueError):
+        return None
+def extract_changed_ids(
+    history_event: HistoryEventRow,
+) -> tuple[set[str], set[str]]:
+    message_ids: set[str] = set()
+    thread_ids: set[str] = set()
+    def push(message: Mapping[str, Any]) -> None:
+        message_id = message.get("id")
+        thread_id = message.get("threadId")
+        if message_id:
+            message_ids.add(str(message_id))
+        if thread_id:
+            thread_ids.add(str(thread_id))
+    for message in history_event.messages:
+        if isinstance(message, Mapping):
+            push(message)
+    history_wrappers = (
+        history_event.messages_added,
+        history_event.messages_deleted,
+        history_event.labels_added,
+        history_event.labels_removed,
+    )
+    for wrappers in history_wrappers:
+        for wrapper in wrappers:
+            if not isinstance(wrapper, Mapping):
+                continue
+            message = wrapper.get("message")
+            if isinstance(message, Mapping):
+                push(message)
+    return message_ids, thread_ids

plugin_gmail/models/types.py ADDED Viewed

@@ -0,0 +1,12 @@
+from __future__ import annotations
+from typing import Annotated
+from pydantic import Field
+from shared_plugins.models import IdStr
+# Message, thread and label identifiers all alias the shared IdStr type.
+MessageId = IdStr
+ThreadId = IdStr
+LabelId = IdStr
+# An int constrained to be >= 0; strict=True presumably disables pydantic's
+# type coercion for this field — confirm against the pydantic version in use.
+NonNegativeInt = Annotated[int, Field(ge=0, strict=True)]
+# History ids are modelled as non-negative integers.
+HistoryId = NonNegativeInt

plugin_gmail/plugin.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+	"auth": {
+		"provider_id": "google",
+		"scopes": ["https://www.googleapis.com/auth/gmail.readonly"],
+		"type": "oauth"
+	},
+	"mode": "dagster",
+	"plugin_id": "gmail"
+}

plugin_gmail/sources/__init__.py ADDED Viewed

File without changes