PyPI - arcade-google-docs - Versions diffs - 4.3.1__py3-none-any.whl → 5.0.0__py3-none-any.whl - Mend

arcade-google-docs 4.3.1__py3-none-any.whl → 5.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

arcade_google_docs/__init__.py +6 -0
arcade_google_docs/doc_to_html.py +195 -4
arcade_google_docs/doc_to_markdown.py +181 -3
arcade_google_docs/docmd.py +115 -36
arcade_google_docs/models/responses.py +143 -0
arcade_google_docs/tools/__init__.py +7 -1
arcade_google_docs/tools/edit_agent/utils.py +3 -1
arcade_google_docs/tools/get.py +64 -7
arcade_google_docs/tools/search.py +33 -33
arcade_google_docs/utils.py +262 -1
{arcade_google_docs-4.3.1.dist-info → arcade_google_docs-5.0.0.dist-info}/METADATA +4 -4
{arcade_google_docs-4.3.1.dist-info → arcade_google_docs-5.0.0.dist-info}/RECORD +13 -12
{arcade_google_docs-4.3.1.dist-info → arcade_google_docs-5.0.0.dist-info}/WHEEL +0 -0

arcade_google_docs/docmd.py CHANGED Viewed

@@ -54,12 +54,15 @@ Example DocMD as a string:
 from collections.abc import Callable
 from enum import Enum
+from arcade_tdk.errors import RetryableToolError
 from pydantic import BaseModel
 from arcade_google_docs.models.document import (
     Document,
     NamedStyleType,
     Paragraph,
+    StructuralElement,
+    Tab,
     Table,
     TextStyle,
 )
@@ -188,10 +191,9 @@ class DocMD(BaseModel):
         )
-def build_docmd(document: Document) -> DocMD:  # noqa: C901
+def build_docmd(document: Document, tab_id: str | None = None) -> DocMD:
     doc_id = document.documentId or ""
     rev = document.revisionId
-    tab = ""
     counters: dict[str, int] = {
         "H": 0,
@@ -210,7 +212,73 @@ def build_docmd(document: Document) -> DocMD:  # noqa: C901
     blocks: list[DocMDBlock] = []
-    for se in document.body.content or [] if document.body else []:
+    if document.tabs and len(document.tabs) > 0:
+        flattened_tabs = _flatten_tabs_depth_first(document.tabs)
+        if tab_id:
+            matching_tabs = [
+                t for t in flattened_tabs if t.tabProperties and t.tabProperties.tabId == tab_id
+            ]
+            if not matching_tabs:
+                available_ids = [t.tabProperties.tabId for t in flattened_tabs if t.tabProperties]
+                raise RetryableToolError(
+                    message=f"Tab with ID '{tab_id}' not found in document",
+                    additional_prompt_content=f"Available tab IDs: {available_ids}",
+                    retry_after_ms=100,
+                )
+            flattened_tabs = matching_tabs
+        for tab_obj in flattened_tabs:
+            if not tab_obj.documentTab or not tab_obj.tabProperties:
+                continue
+            tab_metadata = {
+                "tabId": tab_obj.tabProperties.tabId or "",
+                "title": tab_obj.tabProperties.title or "",
+                "nestingLevel": str(tab_obj.tabProperties.nestingLevel or 0),
+                "index": str(tab_obj.tabProperties.index or 0),
+            }
+            if tab_obj.tabProperties.parentTabId:
+                tab_metadata["parentTabId"] = tab_obj.tabProperties.parentTabId
+            body_content = []
+            if tab_obj.documentTab.body and tab_obj.documentTab.body.content:
+                body_content = tab_obj.documentTab.body.content
+            _process_body_content(
+                body_content,
+                next_id,
+                tab_metadata,
+                blocks,
+            )
+    else:
+        body_content = []
+        if document.body and document.body.content:
+            body_content = document.body.content
+        _process_body_content(
+            body_content,
+            next_id,
+            {},
+            blocks,
+        )
+    return DocMD(documentId=doc_id, revisionId=rev, tab="", blocks=blocks)
+def _process_body_content(
+    content: list[StructuralElement],
+    next_id_func: Callable[[str], str],
+    tab_metadata: dict[str, str],
+    blocks: list[DocMDBlock],
+) -> None:
+    """Process structural elements from a body (main document or tab).
+    Args:
+        content: List of structural elements to process
+        next_id_func: Function to generate unique block IDs
+        tab_metadata: Dict with tab information (tabId, title, nestingLevel, etc.)
+        blocks: List to append processed blocks to
+    """
+    for se in content:
         if se.paragraph is not None:
             p: Paragraph = se.paragraph
             named = p.paragraphStyle.namedStyleType if p.paragraphStyle else None
@@ -225,34 +293,30 @@ def build_docmd(document: Document) -> DocMD:  # noqa: C901
             block_type: str
             block_id: str
-            attrs: dict[str, str] = {}
-            # Only add tab attribute if it's not empty (not the default tab)
-            if tab:
-                attrs["tab"] = tab
+            attrs: dict[str, str] = tab_metadata.copy() if tab_metadata else {}
             if is_heading:
                 level = int(str(named).split("_")[-1])
                 block_type = f"HEADING_{level}"
-                block_id = next_id("H")
+                block_id = next_id_func("H")
                 if p.paragraphStyle and p.paragraphStyle.headingId:
                     attrs["headingId"] = p.paragraphStyle.headingId
             else:
                 if p.bullet and p.bullet.listId:
                     block_type = DocMDBlockType.UL_ITEM.value
-                    block_id = next_id("UL")
+                    block_id = next_id_func("UL")
                     attrs["listId"] = p.bullet.listId
                     if p.bullet.nestingLevel is not None:
                         attrs["level"] = str(p.bullet.nestingLevel)
                 else:
                     block_type = DocMDBlockType.PARAGRAPH.value
-                    block_id = next_id("P")
+                    block_id = next_id_func("P")
             vis_start, vis_end, text, style_runs = _visible_span_and_text(p)
             start = vis_start if vis_start is not None else se.startIndex or 0
             end = vis_end if vis_end is not None else se.endIndex or start
             text_line = (text or "").rstrip("\n")
-            # Add style ranges to attrs if any styles are present
             if style_runs:
                 style_ranges = _format_style_ranges(style_runs, start)
                 if style_ranges:
@@ -270,25 +334,19 @@ def build_docmd(document: Document) -> DocMD:  # noqa: C901
             )
         elif se.table is not None:
-            _process_table(se.table, se, next_id, tab, blocks)
-    return DocMD(documentId=doc_id, revisionId=rev, tab=tab, blocks=blocks)
+            _process_table(se.table, se, next_id_func, tab_metadata, blocks)
 def _process_table(  # type: ignore[no-untyped-def]
     table: Table,
     se,
     next_id_func: Callable[[str], str],
-    tab: str,
+    tab_metadata: dict[str, str],
     blocks: list[DocMDBlock],
 ) -> None:
     """Process a table structural element and add table/row/cell blocks."""
     table_id = next_id_func("TABLE")
-    table_attrs: dict[str, str] = {}
-    # Only add tab attribute if it's not empty (not the default tab)
-    if tab:
-        table_attrs["tab"] = tab
+    table_attrs: dict[str, str] = tab_metadata.copy() if tab_metadata else {}
     if table.rows is not None:
         table_attrs["rows"] = str(table.rows)
@@ -310,7 +368,7 @@ def _process_table(  # type: ignore[no-untyped-def]
     )
     for row_idx, table_row in enumerate(table.tableRows or []):
-        _process_table_row(table_row, row_idx, table_start, next_id_func, tab, blocks)
+        _process_table_row(table_row, row_idx, table_start, next_id_func, tab_metadata, blocks)
 def _process_table_row(  # type: ignore[no-untyped-def]
@@ -318,15 +376,13 @@ def _process_table_row(  # type: ignore[no-untyped-def]
     row_idx: int,
     table_start: int,
     next_id_func: Callable[[str], str],
-    tab: str,
+    tab_metadata: dict[str, str],
     blocks: list[DocMDBlock],
 ) -> None:
     """Process a table row and add row/cell blocks."""
     row_id = next_id_func("TR")
-    row_attrs: dict[str, str] = {"row": str(row_idx)}
-    if tab:
-        row_attrs["tab"] = tab
+    row_attrs: dict[str, str] = tab_metadata.copy() if tab_metadata else {}
+    row_attrs["row"] = str(row_idx)
     row_start = table_row.startIndex or table_start
     row_end = table_row.endIndex or row_start
@@ -343,27 +399,25 @@ def _process_table_row(  # type: ignore[no-untyped-def]
     )
     for cell_idx, table_cell in enumerate(table_row.tableCells or []):
-        _process_table_cell(table_cell, row_idx, cell_idx, row_start, next_id_func, tab, blocks)
+        _process_table_cell(
+            table_cell, row_idx, cell_idx, row_start, next_id_func, tab_metadata, blocks
+        )
-def _process_table_cell(  # type: ignore[no-untyped-def]  # noqa: C901
+def _process_table_cell(  # type: ignore[no-untyped-def]
     table_cell,
     row_idx: int,
     cell_idx: int,
     row_start: int,
     next_id_func: Callable[[str], str],
-    tab: str,
+    tab_metadata: dict[str, str],
     blocks: list[DocMDBlock],
 ) -> None:
     """Process a table cell and add cell block."""
     cell_id = next_id_func("TC")
-    cell_attrs: dict[str, str] = {
-        "row": str(row_idx),
-        "col": str(cell_idx),
-    }
-    if tab:
-        cell_attrs["tab"] = tab
+    cell_attrs: dict[str, str] = tab_metadata.copy() if tab_metadata else {}
+    cell_attrs["row"] = str(row_idx)
+    cell_attrs["col"] = str(cell_idx)
     # Add cell styling attributes if present
     if (
@@ -532,3 +586,28 @@ def _format_style_ranges(style_runs: list[dict], block_start: int) -> str:
             consolidated.append(f"{style_str}:{abs_start}-{abs_end}")
     return ",".join(consolidated) if consolidated else ""
+def _flatten_tabs_depth_first(
+    tabs: list[Tab] | None, max_depth: int = 4, current_depth: int = 0
+) -> list[Tab]:
+    """Flatten tab hierarchy using depth-first traversal.
+    Args:
+        tabs: List of Tab objects, potentially with nested childTabs
+        max_depth: Maximum recursion depth (Google Docs enforces 3 levels, using 4 for safety)
+        current_depth: Current recursion depth (internal use)
+    Returns:
+        Flattened list of tabs in depth-first order (parent → children → grandchildren)
+    """
+    if not tabs or current_depth >= max_depth:
+        return []
+    result: list[Tab] = []
+    for tab in tabs:
+        result.append(tab)
+        if tab.childTabs:
+            result.extend(_flatten_tabs_depth_first(tab.childTabs, max_depth, current_depth + 1))
+    return result

arcade_google_docs/models/responses.py ADDED Viewed

@@ -0,0 +1,143 @@
+"""
+TypedDict response models for Google Docs tools.
+These models define the structure of responses returned by Google Docs tools,
+with field descriptions as string literals for tool compatibility.
+"""
+from typing import TypedDict
+class TabMetadata(TypedDict, total=False):
+    """Metadata for a single tab in a Google Docs document."""
+    tabId: str
+    """The unique identifier of the tab."""
+    title: str
+    """The title/name of the tab."""
+    index: int
+    """The position of the tab among its siblings (0-indexed)."""
+    nestingLevel: int
+    """The nesting depth (0 for top-level, 1 for child, 2 for grandchild)."""
+    approximateCharacterCount: int
+    """Approximate number of characters in this tab's content (excluding child tabs)."""
+    approximateWordCount: int
+    """Approximate number of words in this tab's content (excluding child tabs)."""
+    parentTabId: str
+    """The ID of the parent tab (if this is a nested tab)."""
+    childTabs: list[dict]
+    """List of nested child tabs within this tab (each follows TabMetadata structure)."""
+class DocumentMetadata(TypedDict):
+    """Complete metadata for a Google Docs document including tab hierarchy."""
+    documentId: str
+    """The unique identifier of the document."""
+    title: str
+    """The title of the document."""
+    documentUrl: str
+    """The URL to open and edit the document in Google Docs."""
+    approximateTotalCharacterCount: int
+    """Approximate total number of characters across all tabs (or main body if no tabs)."""
+    approximateTotalWordCount: int
+    """Approximate total number of words across all tabs (or main body if no tabs)."""
+    tabsCount: int
+    """The total number of tabs in the document."""
+    tabs: list[dict]
+    """List of tabs with hierarchical structure (each follows TabMetadata structure)."""
+class DocumentContentResult(TypedDict):
+    """A document with its content in a specific format and metadata."""
+    documentId: str
+    """The unique identifier of the document."""
+    title: str
+    """The title of the document."""
+    documentUrl: str
+    """The URL to open and edit the document in Google Docs."""
+    content: str
+    """The document content in the requested format (markdown, HTML, or DocMD)."""
+    format: str
+    """The format of the content: 'markdown', 'html', 'docmd', or 'google_api_json'."""
+    tabs_count: int
+    """The number of tabs in the document (0 if no tabs)."""
+    total_character_count: int
+    """Approximate total character count across all tabs or main body if no tabs."""
+    total_word_count: int
+    """Approximate total word count across all tabs or main body if no tabs."""
+    main_body_character_count: int
+    """Approximate character count of the main body content only (0 if document has tabs)."""
+    main_body_word_count: int
+    """Approximate word count of the main body content only (0 if document has tabs)."""
+class DocumentListItem(TypedDict):
+    """Metadata for a document from search results."""
+    id: str
+    """The unique identifier of the document."""
+    name: str
+    """The name/title of the document."""
+    kind: str
+    """The kind of the resource (typically 'drive#file')."""
+    mimeType: str
+    """The MIME type (typically 'application/vnd.google-apps.document')."""
+class SearchDocumentsResponse(TypedDict, total=False):
+    """Response from search_documents with document metadata and pagination."""
+    documents_count: int
+    """The number of documents returned in this response."""
+    documents: list[dict]
+    """List of document metadata matching search criteria."""
+    pagination_token: str
+    """Token to retrieve the next page of results (if available)."""
+    has_more: bool
+    """Whether there are more documents available to retrieve."""
+class SearchAndRetrieveResponse(TypedDict, total=False):
+    """Response from search_and_retrieve_documents with full content and metadata."""
+    documents_count: int
+    """The number of documents returned in this response."""
+    documents: list[dict]
+    """List of documents with their content and metadata."""
+    pagination_token: str
+    """Token to retrieve the next page of results (if available)."""
+    has_more: bool
+    """Whether there are more documents available to retrieve."""

arcade_google_docs/tools/__init__.py CHANGED Viewed

@@ -7,7 +7,11 @@ from arcade_google_docs.tools.create import (
     create_document_from_text,
 )
 from arcade_google_docs.tools.file_picker import generate_google_file_picker_url
-from arcade_google_docs.tools.get import get_document_by_id
+from arcade_google_docs.tools.get import (
+    get_document_as_docmd,
+    get_document_by_id,
+    get_document_metadata,
+)
 from arcade_google_docs.tools.search import (
     search_and_retrieve_documents,
     search_documents,
@@ -18,7 +22,9 @@ from arcade_google_docs.tools.update import insert_text_at_end_of_document
 __all__ = [
     "create_blank_document",
     "create_document_from_text",
+    "get_document_as_docmd",
     "get_document_by_id",
+    "get_document_metadata",
     "comment_on_document",
     "list_document_comments",
     "insert_text_at_end_of_document",

arcade_google_docs/tools/edit_agent/utils.py CHANGED Viewed

@@ -15,7 +15,9 @@ def get_docmd(google_service: Any, document_id: str) -> DocMD:
     Returns:
         DocMD object
     """
-    google_get_response = google_service.documents().get(documentId=document_id).execute()
+    google_get_response = (
+        google_service.documents().get(documentId=document_id, includeTabsContent=True).execute()
+    )
     document = Document(**google_get_response)
     docmd = build_docmd(document)
     return docmd

arcade_google_docs/tools/get.py CHANGED Viewed

@@ -5,7 +5,15 @@ from arcade_tdk.auth import Google
 from arcade_google_docs.docmd import build_docmd
 from arcade_google_docs.models.document import Document
-from arcade_google_docs.utils import build_docs_service
+from arcade_google_docs.models.responses import DocumentMetadata
+from arcade_google_docs.utils import (
+    _calculate_character_count,
+    _calculate_word_count,
+    build_docs_service,
+    build_tab_metadata_recursive,
+    count_tab_chars_recursive,
+    count_tab_words_recursive,
+)
 # Uses https://developers.google.com/docs/api/reference/rest/v1/documents/get
@@ -28,9 +36,7 @@ async def get_document_by_id(
     """
     service = build_docs_service(context.get_auth_token_or_empty())
-    # Execute the documents().get() method. Returns a Document object
-    # https://developers.google.com/docs/api/reference/rest/v1/documents#Document
-    request = service.documents().get(documentId=document_id)
+    request = service.documents().get(documentId=document_id, includeTabsContent=True)
     response = request.execute()
     return dict(response)
@@ -45,14 +51,65 @@ async def get_document_by_id(
 async def get_document_as_docmd(
     context: ToolContext,
     document_id: Annotated[str, "The ID of the document to retrieve."],
+    tab_id: Annotated[
+        str | None,
+        "The ID of a specific tab to retrieve. If provided, returns only content from that tab. "
+        "If omitted, returns all tabs in sequential depth-first order.",
+    ] = None,
 ) -> Annotated[str, "The document contents as DocMD"]:
     """
     Get the latest version of the specified Google Docs document as DocMD.
     The DocMD output will include tags that can be used to annotate the document with location
-    information, the type of block, block IDs, and other metadata.
+    information, the type of block, block IDs, and other metadata. If the document has tabs,
+    all tabs are included in sequential order unless a specific tab_id is provided.
     """
     service = build_docs_service(context.get_auth_token_or_empty())
-    request = service.documents().get(documentId=document_id)
+    request = service.documents().get(documentId=document_id, includeTabsContent=True)
     response = request.execute()
-    return build_docmd(Document(**response)).to_string()
+    return build_docmd(Document(**response), tab_id=tab_id).to_string()
+@tool(
+    requires_auth=Google(
+        scopes=[
+            "https://www.googleapis.com/auth/drive.file",
+        ],
+    ),
+)
+async def get_document_metadata(
+    context: ToolContext,
+    document_id: Annotated[str, "The ID of the document to get metadata for"],
+) -> Annotated[DocumentMetadata, "Document metadata including hierarchical tab structure"]:
+    """
+    Get metadata for a Google Docs document including hierarchical tab structure.
+    Returns document title, ID, URL, total character count, and nested tab information
+    with character counts for each tab.
+    """
+    service = build_docs_service(context.get_auth_token_or_empty())
+    request = service.documents().get(documentId=document_id, includeTabsContent=True)
+    response = request.execute()
+    document = Document(**response)
+    total_char_count = 0
+    total_word_count = 0
+    tabs_metadata: list = []
+    if document.tabs and len(document.tabs) > 0:
+        tabs_metadata = build_tab_metadata_recursive(document.tabs)
+        total_char_count = sum(count_tab_chars_recursive(tab) for tab in tabs_metadata)
+        total_word_count = sum(count_tab_words_recursive(tab) for tab in tabs_metadata)
+    elif document.body:
+        total_char_count = _calculate_character_count(document.body.content)
+        total_word_count = _calculate_word_count(document.body.content)
+    return {
+        "documentId": document.documentId or "",
+        "title": document.title or "",
+        "documentUrl": f"https://docs.google.com/document/d/{document.documentId}/edit",
+        "approximateTotalCharacterCount": total_char_count,
+        "approximateTotalWordCount": total_word_count,
+        "tabsCount": len(tabs_metadata),
+        "tabs": tabs_metadata,
+    }

arcade_google_docs/tools/search.py CHANGED Viewed

@@ -3,15 +3,18 @@ from typing import Annotated, Any
 from arcade_tdk import ToolContext, tool
 from arcade_tdk.auth import Google
-from arcade_google_docs.doc_to_html import convert_document_to_html
-from arcade_google_docs.doc_to_markdown import convert_document_to_markdown
-from arcade_google_docs.docmd import build_docmd
 from arcade_google_docs.enum import DocumentFormat, OrderBy
 from arcade_google_docs.models.document import Document
+from arcade_google_docs.models.responses import (
+    SearchAndRetrieveResponse,
+    SearchDocumentsResponse,
+)
 from arcade_google_docs.tools import get_document_by_id
 from arcade_google_docs.utils import (
+    build_document_content_result,
     build_drive_service,
     build_files_list_params,
+    build_search_retrieve_response,
 )
@@ -63,12 +66,12 @@ async def search_documents(
         str | None, "The pagination token to continue a previous request"
     ] = None,
 ) -> Annotated[
-    dict,
-    "A dictionary containing 'documents_count' (number of documents returned) and 'documents' "
-    "(a list of document details including 'kind', 'mimeType', 'id', and 'name' for each document)",
+    SearchDocumentsResponse,
+    "Document count, list of documents, pagination token, and has_more flag",
 ]:
     """
-    Searches for documents in the user's Google Drive. Excludes documents that are in the trash.
+    Searches for documents in the user's Google Drive. Excludes documents in trash.
+    Returns metadata only. Use get_document_metadata or get_document_as_docmd for content.
     """
     if document_contains or document_not_contains:
         # Google drive API does not support other order_by values for
@@ -111,11 +114,17 @@ async def search_documents(
         if not pagination_token or len(batch) < page_size:
             break
-    return {
+    response_dict: dict = {
         "documents_count": len(files),
         "documents": files,
+        "has_more": pagination_token is not None,
     }
+    if pagination_token:
+        response_dict["pagination_token"] = pagination_token
+    return response_dict  # type: ignore[return-value]
 @tool(
     requires_auth=Google(
@@ -163,18 +172,18 @@ async def search_and_retrieve_documents(
         str | None, "The pagination token to continue a previous request"
     ] = None,
 ) -> Annotated[
-    dict,
-    "A dictionary containing 'documents_count' (number of documents returned) and 'documents' "
-    "(a list of documents with their content).",
+    SearchAndRetrieveResponse,
+    "A dictionary containing document count, list of documents with content and metadata, "
+    "pagination token, and has_more flag",
 ]:
     """
-    Searches for documents in the user's Google Drive and returns a list of documents (with text
-    content) matching the search criteria. Excludes documents that are in the trash.
+    Searches for documents in the user's Google Drive and returns documents with their main body
+    content and tab metadata. Excludes documents that are in the trash.
-    Note: use this tool only when the user prompt requires the documents' content. If the user only
-    needs a list of documents, use the `search_documents` tool instead.
+    Returns main body content only with metadata about tabs. Use get_document_as_docmd() to retrieve
+    full tab content for specific documents. Use search_documents() for metadata-only searches.
     """
-    response = await search_documents(
+    search_response = await search_documents(
         context=context,
         document_contains=document_contains,
         document_not_contains=document_not_contains,
@@ -186,21 +195,12 @@ async def search_and_retrieve_documents(
         pagination_token=pagination_token,
     )
-    documents = []
-    for item in response["documents"]:
-        document = await get_document_by_id(context, document_id=item["id"])
+    documents: list = []
+    for item in search_response["documents"]:
+        doc_dict = await get_document_by_id(context, document_id=item["id"])
+        document = Document(**doc_dict)
+        doc_result = build_document_content_result(document, doc_dict, return_format)
+        documents.append(doc_result)
-        if return_format == DocumentFormat.DOCMD:
-            document = build_docmd(Document(**document)).to_string()
-        elif return_format == DocumentFormat.MARKDOWN:
-            document = convert_document_to_markdown(document)
-        elif return_format == DocumentFormat.HTML:
-            document = convert_document_to_html(document)
-        documents.append(document)
-    return {
-        "documents_count": len(documents),
-        "documents": documents,
-    }
+    result = build_search_retrieve_response(documents, search_response)
+    return result  # type: ignore[return-value]

arcade-google-docs 4.3.1__py3-none-any.whl → 5.0.0__py3-none-any.whl

arcade-google-docs 4.3.1__py3-none-any.whl → 5.0.0__py3-none-any.whl