unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,848 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
|
|
6
|
+
|
|
7
|
+
from dateutil import parser
|
|
8
|
+
from pydantic import Field, Secret
|
|
9
|
+
from pydantic.functional_validators import BeforeValidator
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.data_types.file_data import (
|
|
12
|
+
FileData,
|
|
13
|
+
FileDataSourceMetadata,
|
|
14
|
+
SourceIdentifiers,
|
|
15
|
+
)
|
|
16
|
+
from unstructured_ingest.error import SourceConnectionError, UserAuthError, ValueError
|
|
17
|
+
from unstructured_ingest.interfaces import (
|
|
18
|
+
AccessConfig,
|
|
19
|
+
ConnectionConfig,
|
|
20
|
+
Downloader,
|
|
21
|
+
DownloaderConfig,
|
|
22
|
+
DownloadResponse,
|
|
23
|
+
Indexer,
|
|
24
|
+
IndexerConfig,
|
|
25
|
+
)
|
|
26
|
+
from unstructured_ingest.logger import logger
|
|
27
|
+
from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
|
|
28
|
+
from unstructured_ingest.processes.connectors.utils import conform_string_to_dict
|
|
29
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
30
|
+
|
|
31
|
+
if TYPE_CHECKING:
|
|
32
|
+
from googleapiclient.discovery import Resource as GoogleAPIResource
|
|
33
|
+
|
|
34
|
+
CONNECTOR_TYPE = "google_drive"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Maps Google-native Drive MIME types → export MIME types
|
|
38
|
+
GOOGLE_EXPORT_MIME_MAP = {
|
|
39
|
+
"application/vnd.google-apps.document": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", # noqa: E501
|
|
40
|
+
"application/vnd.google-apps.spreadsheet": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # noqa: E501
|
|
41
|
+
"application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation", # noqa: E501
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
# Maps export MIME types → file extensions
|
|
45
|
+
EXPORT_EXTENSION_MAP = {
|
|
46
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
|
47
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
|
48
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
|
49
|
+
"application/pdf": ".pdf",
|
|
50
|
+
"text/html": ".html",
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
# The actual LRO export size threshold is 10MB, but the exported file might be slightly larger
|
|
54
|
+
# than the original Google Workspace file - thus the threshold is set to 9MB
|
|
55
|
+
LRO_EXPORT_SIZE_THRESHOLD = 9 * 1024 * 1024 # 9MB
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class GoogleDriveAccessConfig(AccessConfig):
    """Service-account credentials for Google Drive, supplied inline or as a file path."""

    service_account_key: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
        default=None, description="Credentials values to use for authentication"
    )
    service_account_key_path: Optional[Path] = Field(
        default=None,
        description="File path to credentials values to use for authentication",
    )

    def model_post_init(self, __context: Any) -> None:
        """Reject a config that provides neither an inline key nor a key file path."""
        has_inline_key = self.service_account_key is not None
        has_key_path = self.service_account_key_path is not None
        if has_inline_key or has_key_path:
            return
        raise ValueError(
            "either service_account_key or service_account_key_path must be provided"
        )

    def get_service_account_key(self) -> dict:
        """Return the key data, reading it from ``service_account_key_path`` when set.

        When both the path and the inline key are provided, the file content must
        equal the inline key; otherwise a ``ValueError`` is raised. When the file
        yields no data, the inline key is returned instead.
        """
        inline_key = self.service_account_key
        if not self.service_account_key_path:
            return inline_key

        with self.service_account_key_path.open() as key_file:
            file_key = json.load(key_file)

        # An empty/falsy file payload falls back to the inline key.
        if not file_key:
            return inline_key

        if inline_key and file_key != inline_key:
            raise ValueError(
                "service_account_key and service_account_key_path "
                "both provided and have different values"
            )
        return file_key
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class GoogleDriveConnectionConfig(ConnectionConfig):
    """Connection settings for Google Drive: the target ID plus its credentials."""

    drive_id: str = Field(description="Google Drive File or Folder ID.")
    access_config: Secret[GoogleDriveAccessConfig]

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    @contextmanager
    def get_client(self) -> Generator["GoogleAPIResource", None, None]:
        """Yield a Drive v3 ``files()`` resource built from the service account key.

        The ``yield`` sits inside the ``try`` block, so errors raised by the
        caller's ``with`` body are translated by the handlers below as well.

        Raises:
            ValueError: when an ``HttpError`` is raised; the message is its reason.
            UserAuthError: when a ``DefaultCredentialsError`` is raised.
        """
        from google.auth import exceptions
        from google.oauth2 import service_account
        from googleapiclient.discovery import build
        from googleapiclient.errors import HttpError

        access_config = self.access_config.get_secret_value()
        key_data = access_config.get_service_account_key()

        try:
            creds = service_account.Credentials.from_service_account_info(key_data)
            service = build("drive", "v3", credentials=creds)
            with service.files() as client:
                yield client
        except HttpError as exc:
            # Chain explicitly so the original HTTP error stays attached as the cause.
            raise ValueError(f"{exc.reason}") from exc
        except exceptions.DefaultCredentialsError as exc:
            raise UserAuthError("The provided API key is invalid.") from exc
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class GoogleDriveIndexerConfig(IndexerConfig):
    """Indexing options: an optional extension filter and a recursion flag."""

    extensions: Optional[list[str]] = None
    recursive: bool = False

    def model_post_init(self, __context: Any) -> None:
        """Strip leading dots from every configured extension, if any were given."""
        if self.extensions is None:
            return
        self.extensions = [ext.lstrip(".") for ext in self.extensions]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@dataclass
|
|
127
|
+
class GoogleDriveIndexer(Indexer):
|
|
128
|
+
connection_config: GoogleDriveConnectionConfig
|
|
129
|
+
index_config: GoogleDriveIndexerConfig
|
|
130
|
+
fields: list[str] = field(
|
|
131
|
+
default_factory=lambda: [
|
|
132
|
+
"id",
|
|
133
|
+
"name",
|
|
134
|
+
"mimeType",
|
|
135
|
+
"fileExtension",
|
|
136
|
+
"md5Checksum",
|
|
137
|
+
"sha1Checksum",
|
|
138
|
+
"sha256Checksum",
|
|
139
|
+
"headRevisionId",
|
|
140
|
+
"permissions",
|
|
141
|
+
"createdTime",
|
|
142
|
+
"modifiedTime",
|
|
143
|
+
"version",
|
|
144
|
+
"originalFilename",
|
|
145
|
+
"capabilities",
|
|
146
|
+
"permissionIds",
|
|
147
|
+
"size",
|
|
148
|
+
]
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
@staticmethod
def verify_drive_api_enabled(client) -> None:
    """
    Make a lightweight API call to verify that the Drive API is enabled.

    Args:
        client: object exposing ``list(...).execute()``; one file ID is requested.

    Raises:
        SourceConnectionError: when the call raises ``HttpError``. The message
            says the Drive API is not enabled if the error body mentions it,
            and reports an unknown failure otherwise.
    """
    from googleapiclient.errors import HttpError

    try:
        # A very minimal call: list 1 file from the drive.
        client.list(
            supportsAllDrives=True,
            includeItemsFromAllDrives=True,
            spaces="drive",
            pageSize=1,
            fields="files(id)",
        ).execute()
    except HttpError as e:
        # Tolerate a missing, empty, or already-decoded error body.
        raw_content = getattr(e, "content", b"") or b""
        if isinstance(raw_content, bytes):
            error_content = raw_content.decode(errors="replace")
        else:
            error_content = str(raw_content)
        lower_error = error_content.lower()

        api_disabled = "drive api" in lower_error and (
            "not enabled" in lower_error or "not been used" in lower_error
        )
        if api_disabled:
            # Adjacent literals instead of a backslash continuation, which would
            # embed the source indentation in the message.
            raise SourceConnectionError(
                "Google Drive API is not enabled for your project. "
                "Please enable it in the Google Cloud Console."
            ) from e
        raise SourceConnectionError(
            "Google drive API unreachable for an unknown reason!"
        ) from e
|
|
180
|
+
|
|
181
|
+
@staticmethod
def count_files_recursively(
    files_client: "GoogleAPIResource",
    folder_id: str,
    extensions: Optional[list[str]] = None,
) -> int:
    """
    Count non-folder files recursively under the given folder.

    Args:
        files_client: object exposing ``list(...).execute()`` for file listings.
        folder_id: ID of the folder where the traversal starts.
        extensions: when provided and non-empty, only files whose
            ``fileExtension`` matches one of the values are counted. Matching
            is case-insensitive and ignores leading dots, so ``".PDF"`` and
            ``"pdf"`` are equivalent.

    Returns:
        The number of matching non-folder files found in the folder tree.
    """
    # Normalise once: drop leading dots and lower-case, giving a set for O(1) lookup.
    valid_exts = {e.lstrip(".").lower() for e in extensions} if extensions else None

    count = 0
    stack = [folder_id]
    while stack:
        current_folder = stack.pop()
        # Always list all items under the current folder.
        query = f"'{current_folder}' in parents"
        page_token = None
        while True:
            response = files_client.list(
                supportsAllDrives=True,
                includeItemsFromAllDrives=True,
                spaces="drive",
                q=query,
                fields="nextPageToken, files(id, mimeType, fileExtension)",
                pageToken=page_token,
                pageSize=1000,
            ).execute()
            for item in response.get("files", []):
                if item.get("mimeType") == "application/vnd.google-apps.folder":
                    # Always traverse sub-folders regardless of extension filter.
                    stack.append(item["id"])
                    continue
                if valid_exts is None:
                    count += 1
                    continue
                # Items without a fileExtension compare as the empty string.
                file_ext = (item.get("fileExtension") or "").lower()
                if file_ext in valid_exts:
                    count += 1
            page_token = response.get("nextPageToken")
            if not page_token:
                break
    return count
|
|
226
|
+
|
|
227
|
+
def precheck(self) -> None:
    """
    Verify connectivity and that the configured drive_id is accessible.

    Steps, all run with a client from ``connection_config.get_client()``:
      1. Call ``verify_drive_api_enabled``.
      2. Fetch metadata for ``drive_id`` via ``self.get_root_info``.
      3. If the target is a folder, check whether it contains files
         (recursively when ``index_config.recursive`` is set). An empty
         folder only logs a warning; it does not fail the precheck.

    Raises:
        SourceConnectionError: wraps any exception raised during the steps
            above, with the original exception chained as the cause.
    """
    try:
        with self.connection_config.get_client() as client:
            # First, verify that the Drive API is enabled.
            self.verify_drive_api_enabled(client)

            # Try to retrieve metadata for the drive id.
            # This will catch errors such as an invalid drive id or insufficient permissions.
            root_info = self.get_root_info(
                files_client=client, object_id=self.connection_config.drive_id
            )
            logger.info(
                f"Successfully retrieved drive root info: "
                f"{root_info.get('name', 'Unnamed')} (ID: {root_info.get('id')})"
            )

            if not self.is_dir(root_info):
                # If the target is a file, precheck passes.
                logger.info("Drive ID corresponds to a file. Precheck passed.")
            elif self.index_config.recursive:
                file_count = self.count_files_recursively(
                    client,
                    self.connection_config.drive_id,
                    extensions=self.index_config.extensions,
                )
                if file_count == 0:
                    # Adjacent literals keep source indentation out of the message.
                    logger.warning(
                        "Empty folder: no files found recursively in the folder. "
                        "Please verify that the folder contains files and "
                        "that the service account has proper permissions."
                    )
                else:
                    logger.info(f"Found {file_count} files recursively in the folder.")
            else:
                # Non-recursive: check for at least one immediate non-folder child.
                # Folders are excluded in the query so a root holding only
                # sub-folders is reported as empty.
                response = client.list(
                    supportsAllDrives=True,
                    includeItemsFromAllDrives=True,
                    spaces="drive",
                    fields="files(id)",
                    pageSize=1,
                    q=(
                        f"'{self.connection_config.drive_id}' in parents "
                        "and mimeType != 'application/vnd.google-apps.folder'"
                    ),
                ).execute()
                if not response.get("files"):
                    logger.warning(
                        "Empty folder: no files found at the folder's root level. "
                        "Please verify that the folder contains files and "
                        "that the service account has proper permissions."
                    )
                else:
                    logger.info("Found files at the folder's root level.")

    except Exception as e:
        logger.error(
            "Failed to validate Google Drive connection during precheck",
            exc_info=True,
        )
        raise SourceConnectionError(f"Precheck failed: {e}") from e
|
|
301
|
+
|
|
302
|
+
@staticmethod
def is_dir(record: dict) -> bool:
    """Return True when the record's ``mimeType`` is the Google Drive folder type."""
    folder_mime_type = "application/vnd.google-apps.folder"
    mime_type = record.get("mimeType")
    return mime_type == folder_mime_type
|
|
305
|
+
|
|
306
|
+
@staticmethod
def map_file_data(root_info: dict) -> FileData:
    """
    Convert a Drive file record into a ``FileData`` object.

    NOTE: ``root_info`` is mutated. The keys ``name``, ``webContentLink``,
    ``version``, ``permissions``, ``createdTime``, ``modifiedTime``,
    ``parent_path`` and ``parent_root_path`` are popped, and the remaining
    dict is passed on as ``additional_metadata``.

    Args:
        root_info: file record; ``id`` and ``name`` are required, every other
            key read here is optional.

    Returns:
        A ``FileData`` whose created/modified dates are stringified epoch
        timestamps, or ``None`` when the record has no corresponding time.

    Raises:
        KeyError: when ``id`` or ``name`` is missing from ``root_info``.
    """
    file_id = root_info["id"]
    filename = root_info.pop("name")
    url = root_info.pop("webContentLink", None)
    version = root_info.pop("version", None)
    permissions = root_info.pop("permissions", None)
    date_created_str = root_info.pop("createdTime", None)
    date_modified_str = root_info.pop("modifiedTime", None)
    parent_path = root_info.pop("parent_path", None)
    parent_root_path = root_info.pop("parent_root_path", None)

    # Only convert when a time string is present; calling .timestamp() on a
    # missing value would raise AttributeError.
    date_created = None
    if date_created_str:
        date_created = str(parser.parse(date_created_str).timestamp())
    date_modified = None
    if date_modified_str:
        date_modified = str(parser.parse(date_modified_str).timestamp())

    if (
        parent_path
        and isinstance(parent_path, str)
        and parent_root_path
        and isinstance(parent_root_path, str)
    ):
        fullpath = f"{parent_path}/{filename}"
        rel_path = Path(fullpath).relative_to(parent_root_path).as_posix()
        source_identifiers = SourceIdentifiers(
            filename=filename, fullpath=fullpath, rel_path=rel_path
        )
    else:
        # Without usable parent paths, the filename doubles as the full path.
        source_identifiers = SourceIdentifiers(fullpath=filename, filename=filename)

    return FileData(
        connector_type=CONNECTOR_TYPE,
        identifier=file_id,
        source_identifiers=source_identifiers,
        metadata=FileDataSourceMetadata(
            url=url,
            version=version,
            date_created=date_created,
            date_modified=date_modified,
            permissions_data=permissions,
            record_locator={
                "file_id": file_id,
            },
        ),
        additional_metadata=root_info,
        display_name=source_identifiers.fullpath,
    )
|
|
349
|
+
|
|
350
|
+
def get_paginated_results(
    self,
    files_client,
    object_id: str,
    extensions: Optional[list[str]] = None,
    recursive: bool = False,
    previous_path: Optional[str] = None,
) -> list[dict]:
    """List the non-folder children of a Drive folder, following result pages.

    Args:
        files_client: Client whose ``list(...).execute()`` returns one page of results.
        object_id: ID of the folder whose children are listed.
        extensions: When given, restricts files to these extensions; folders are still
            returned by the query so they can be descended into.
        recursive: When true, sub-folders are listed as well.
        previous_path: Path of the folder being listed; stored on every returned record
            as ``parent_path`` (for direct children) and ``parent_root_path``.

    Returns:
        The file records, each annotated with ``parent_path`` and ``parent_root_path``.
    """
    fields_input = "nextPageToken, files({})".format(",".join(self.fields))
    q = f"'{object_id}' in parents"
    # Filter by extension but still include any directories
    if extensions:
        ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions])
        q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')"
    logger.debug(f"query used when indexing: {q}")
    logger.debug("response fields limited to: {}".format(", ".join(self.fields)))

    collected: list[dict] = []
    page_token = None
    while True:
        response: dict = files_client.list(
            supportsAllDrives=True,
            includeItemsFromAllDrives=True,
            spaces="drive",
            fields=fields_input,
            corpora="user",
            pageToken=page_token,
            q=q,
        ).execute()
        page_items = response.get("files", [])
        if page_items:
            # Split the page into plain files (kept) and folders (optionally descended).
            folders = []
            for item in page_items:
                if self.is_dir(record=item):
                    folders.append(item)
                else:
                    item["parent_path"] = previous_path
                    collected.append(item)
            if recursive:
                for folder in folders:
                    folder_id = folder["id"]
                    folder_name = folder["name"]
                    nested = self.get_paginated_results(
                        files_client=files_client,
                        object_id=folder_id,
                        extensions=extensions,
                        recursive=recursive,
                        previous_path=f"{previous_path}/{folder_name}",
                    )
                    collected.extend(nested)
        page_token = response.get("nextPageToken")
        if page_token is None:
            break

    # Every record, including those gathered from sub-folders, is stamped with the
    # path of the folder this call started from.
    for item in collected:
        item["parent_root_path"] = previous_path
    return collected
|
|
404
|
+
|
|
405
|
+
def get_root_info(self, files_client, object_id: str) -> dict:
    """Fetch the record of a single Drive object, limited to ``self.fields``.

    Args:
        files_client: Client whose ``get(...).execute()`` returns the object's record.
        object_id: ID of the file or folder to look up.

    Returns:
        The record returned by the API call.
    """
    requested_fields = ",".join(self.fields)
    request = files_client.get(
        supportsAllDrives=True,
        fileId=object_id,
        fields=requested_fields,
    )
    return request.execute()
|
|
409
|
+
|
|
410
|
+
def get_files(
    self,
    files_client,
    object_id: str,
    recursive: bool = False,
    extensions: Optional[list[str]] = None,
) -> list[FileData]:
    """Resolve a Drive object into the list of files it represents.

    When ``object_id`` refers to a single file, that file is returned on its own. When
    it refers to a folder, the folder's contents are listed (optionally recursively and
    filtered by extension). Raw permissions on every record are normalized with
    ``extract_permissions`` before the record is mapped to ``FileData``.

    Args:
        files_client: Drive files client used for the lookups.
        object_id: ID of the file or folder to index.
        recursive: Whether sub-folders are listed as well.
        extensions: Optional list of file extensions to restrict the listing to.

    Returns:
        One ``FileData`` per file, each with ``drive_id`` set to ``object_id`` in its
        ``record_locator``.
    """
    root_info = self.get_root_info(files_client=files_client, object_id=object_id)
    if not self.is_dir(root_info):
        root_info["permissions"] = self.extract_permissions(root_info.get("permissions"))
        data = [self.map_file_data(root_info)]
    else:
        file_contents = self.get_paginated_results(
            files_client=files_client,
            object_id=object_id,
            extensions=extensions,
            recursive=recursive,
            previous_path=root_info["name"],
        )
        data = []
        for f in file_contents:
            f["permissions"] = self.extract_permissions(f.get("permissions"))
            data.append(self.map_file_data(root_info=f))
    for d in data:
        # Must be an assignment: written with ``:`` this line is only an annotation
        # and never stores the drive id.
        d.metadata.record_locator["drive_id"] = object_id
    return data
|
|
436
|
+
|
|
437
|
+
def extract_permissions(self, permissions: Optional[list[dict]]) -> list[dict]:
    """Normalize Drive permission entries into per-operation user/group ID lists.

    Args:
        permissions: Permission entries, each with ``type``, ``role`` and ``id`` keys,
            or ``None`` / empty when there is nothing to normalize.

    Returns:
        ``[{}]`` when no permissions are given; otherwise three single-key dicts, for
        ``read``, ``update`` and ``delete`` in that order, each mapping to
        ``{"users": [...], "groups": [...]}`` with sorted IDs.

    Raises:
        KeyError: If a user/group entry carries a role missing from the role table.
    """
    if not permissions:
        logger.debug("no permissions found")
        return [{}]

    # https://developers.google.com/workspace/drive/api/guides/ref-roles
    operations_by_role = {
        "owner": ["read", "update", "delete"],
        "organizer": ["read", "update", "delete"],
        "fileOrganizer": ["read", "update"],
        "writer": ["read", "update"],
        "commenter": ["read"],
        "reader": ["read"],
    }

    granted = {
        operation: {"users": set(), "groups": set()}
        for operation in ("read", "update", "delete")
    }

    for entry in permissions:
        # https://developers.google.com/workspace/drive/api/reference/rest/v3/permissions
        # ignore permissions for "anyone" and "domain"
        if entry["type"] not in ["user", "group"]:
            continue
        bucket = entry["type"] + "s"
        for operation in operations_by_role[entry["role"]]:
            granted[operation][bucket].add(entry["id"])

    # turn sets into sorted lists for consistency and json serialization
    normalized_permissions = {
        operation: {kind: sorted(members) for kind, members in by_kind.items()}
        for operation, by_kind in granted.items()
    }

    logger.debug(f"normalized permissions generated: {normalized_permissions}")
    return [{operation: by_kind} for operation, by_kind in normalized_permissions.items()]
|
|
473
|
+
|
|
474
|
+
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
    """Yield a ``FileData`` for every file found under the configured drive ID.

    The listing honours the ``recursive`` and ``extensions`` settings of the index
    configuration. Extra keyword arguments are accepted but not used.
    """
    with self.connection_config.get_client() as drive_files:
        indexed = self.get_files(
            files_client=drive_files,
            object_id=self.connection_config.drive_id,
            recursive=self.index_config.recursive,
            extensions=self.index_config.extensions,
        )
        for file_data in indexed:
            yield file_data
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
class GoogleDriveDownloaderConfig(DownloaderConfig):
    # Limits applied when polling a long-running (LRO) export operation.
    # Maximum number of polling attempts.
    lro_max_tries: int = 10
    # Maximum total polling time, in seconds (10 minutes).
    lro_max_time: int = 600
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def _get_extension(file_data: FileData) -> str:
    """
    Returns the extension for a given source MIME type.

    The source MIME type is read from the ``mimeType`` entry of the file's additional
    metadata, translated to its export MIME type through ``GOOGLE_EXPORT_MIME_MAP`` and
    then to an extension through ``EXPORT_EXTENSION_MAP``.

    Args:
        file_data: Metadata of the file about to be exported.

    Returns:
        The extension for the export format, or an empty string when the source MIME
        type has no export mapping or the export type has no known extension.
    """
    # ``GOOGLE_EXPORT_MIME_MAP`` is keyed by the *source* MIME type, so the lookup must
    # use ``mimeType``. ``export_mime_type`` is only written to the metadata after the
    # export has run, and holds a value of that map rather than a key.
    source_mime_type = file_data.additional_metadata.get("mimeType", "")
    export_mime_type = GOOGLE_EXPORT_MIME_MAP.get(source_mime_type, "")
    if not export_mime_type:
        return ""
    return EXPORT_EXTENSION_MAP.get(export_mime_type, "")
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
@dataclass
class GoogleDriveDownloader(Downloader):
    """
    Downloads files from Google Drive using googleapis client. For native files, it uses the export
    functionality for files <10MB and LRO (Long Running Operation) for files >10MB.
    """

    connection_config: GoogleDriveConnectionConfig
    download_config: GoogleDriveDownloaderConfig = field(
        default_factory=lambda: GoogleDriveDownloaderConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def _direct_download_file(self, file_id, download_path: Path):
        """Downloads a file from Google Drive using the Drive API's media download functionality.
        The method uses Google Drive API's media download functionality to stream the file
        content directly to disk.

        Args:
            file_id (str): The ID of the file to download from Google Drive.
            download_path (Path): The local path where the file should be saved.

        Raises:
            SourceConnectionError: If the download operation fails.
        """
        from googleapiclient.errors import HttpError
        from googleapiclient.http import MediaIoBaseDownload

        try:
            with self.connection_config.get_client() as client:
                # pylint: disable=maybe-no-member
                request = client.get_media(fileId=file_id)

                with open(download_path, "wb") as file:
                    downloader = MediaIoBaseDownload(file, request)
                    done = False
                    while done is False:
                        status, done = downloader.next_chunk()
                        logger.debug(f"Download progress:{int(status.progress() * 100)}.")

        except (HttpError, ValueError) as error:
            logger.exception(f"Error downloading file {file_id} to {download_path}: {error}")
            raise SourceConnectionError("Failed to download file") from error

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def _export_gdrive_file_with_lro(self, file_id: str, download_path: Path, mime_type: str):
        """Exports a Google Drive file using Long-Running Operation (LRO) for large files
        (>10MB of the exported file size).

        This method is used when the standard export method fails due to file size limitations.
        It uses the Drive API's LRO functionality to handle large file exports.

        Args:
            file_id (str): The ID of the Google Drive file to export.
            download_path (Path): The local path where the exported file should be saved.
            mime_type (str): The target MIME type for the exported file.

        Raises:
            SourceConnectionError: If the export operation fails, reports an error, or
                does not finish within the configured retry limits.
        """

        import tenacity
        from googleapiclient.errors import HttpError

        max_time = self.download_config.lro_max_time
        max_tries = self.download_config.lro_max_tries

        class OperationNotFinished(Exception):
            """
            Exception raised when the operation is not finished.
            """

        def is_fatal_code(e: Exception) -> bool:
            """
            Returns True if the error is fatal and should not be retried.
            403 and 429 can mean "Too many requests" or "User rate limit exceeded"
            which should be retried.
            """
            return (
                isinstance(e, HttpError)
                and 400 <= e.resp.status < 500
                and e.resp.status not in [403, 429]
            )

        def _raise_if_failed(operation: dict) -> None:
            """Raise SourceConnectionError when the operation carries an error payload."""
            if "error" not in operation:
                return
            error = operation["error"]
            message = error.get("message", error) if isinstance(error, dict) else error
            raise SourceConnectionError(f"Export operation failed: {message}")

        @tenacity.retry(
            wait=tenacity.wait_exponential(),
            retry=tenacity.retry_if_exception(
                lambda e: (
                    isinstance(e, (HttpError, OperationNotFinished)) and not is_fatal_code(e)
                )
            ),
            stop=(tenacity.stop_after_attempt(max_tries) | tenacity.stop_after_delay(max_time)),
        )
        def _poll_operation(operation: dict, operations_client: "GoogleAPIResource") -> dict:
            """
            Helper function to poll the operation until it's complete.
            Uses backoff exponential retry logic.

            Each `operations.get` call uses the Google API requests limit. Details:
            https://developers.google.com/workspace/drive/api/guides/limits

            The limits as of May 2025 are:
            - 12.000 calls per 60 seconds

            In case of request limiting, the API will return 403 `User rate limit exceeded` error
            or 429 `Too many requests` error.
            """
            # A failed operation is reported as done *with* an error payload, so the
            # error has to be checked before the completion flag; otherwise the failed
            # operation is returned and the caller trips over the missing response.
            _raise_if_failed(operation)
            if operation.get("done", False):
                return operation
            # Refresh the operation status:
            # FYI: In some cases the `operations.get` call errors with 403 "User does not have
            # permission" error even if the same user create the operation with `download` method.
            updated_operation = operations_client.get(name=operation["name"]).execute()
            _raise_if_failed(updated_operation)
            if not updated_operation.get("done", False):
                raise OperationNotFinished()
            return updated_operation

        try:
            with self._get_files_and_operations_client() as (files_client, operations_client):
                # Start the LRO
                operation = files_client.download(fileId=file_id, mimeType=mime_type).execute()

                # In case the operation is not finished, poll it until it's complete
                updated_operation = _poll_operation(operation, operations_client)

                # Get the download URI from the completed operation
                download_uri = updated_operation["response"]["downloadUri"]

                # Download the file using the URI
                self._raw_download_google_drive_file(download_uri, download_path)

        except HttpError as error:
            raise SourceConnectionError(
                f"Failed to export file using Google Drive LRO: {error}"
            ) from error
        except tenacity.RetryError as error:
            # Raised by tenacity once the attempt/time budget is exhausted; surface it
            # as the connector's own error type, as the docstring promises.
            raise SourceConnectionError(
                "Failed to export file using Google Drive LRO: polling gave up after "
                f"at most {max_tries} attempts or {max_time} seconds"
            ) from error

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def _export_gdrive_native_file(
        self, file_id: str, download_path: Path, mime_type: str, file_size: int
    ):
        """Exports a Google Drive native file (Docs, Sheets, Slides) to a specified format.

        This method uses the Google Drive API's export functionality to convert Google Workspace
        files to other formats (e.g., Google Docs to PDF, Google Sheets to Excel).
        For files larger than ``LRO_EXPORT_SIZE_THRESHOLD``, or when the direct export is
        rejected as too large, it falls back to using Long-Running Operation (LRO).

        Args:
            file_id (str): The ID of the Google Drive file to export.
            download_path (Path): The local path where the exported file should be saved.
            mime_type (str): The target MIME type for the exported file (e.g., 'application/pdf').
            file_size (int): The size of the file to export - used to determine if the
                file is large enough to use LRO instead of direct export endpoint.

        Returns:
            None. The exported content is written to ``download_path``.

        Raises:
            SourceConnectionError: If the export operation fails.
        """
        from googleapiclient.errors import HttpError
        from googleapiclient.http import MediaIoBaseDownload

        if file_size > LRO_EXPORT_SIZE_THRESHOLD:
            self._export_gdrive_file_with_lro(file_id, download_path, mime_type)
            return

        with self.connection_config.get_client() as client:
            try:
                # pylint: disable=maybe-no-member
                request = client.export_media(fileId=file_id, mimeType=mime_type)
                with open(download_path, "wb") as file:
                    downloader = MediaIoBaseDownload(file, request)
                    done = False
                    while done is False:
                        status, done = downloader.next_chunk()
                        logger.debug(f"Download progress: {int(status.progress() * 100)}.")
            except HttpError as error:
                if error.resp.status == 403 and "too large" in error.reason.lower():
                    # Even though we have the LRO threshold, for some smaller files the
                    # export size might exceed 10MB and we get a 403 error.
                    # In that case, we use LRO as a fallback.
                    self._export_gdrive_file_with_lro(file_id, download_path, mime_type)
                else:
                    raise SourceConnectionError(f"Failed to export file: {error}") from error

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    @contextmanager
    def _get_files_and_operations_client(
        self,
    ) -> Generator[tuple["GoogleAPIResource", "GoogleAPIResource"], None, None]:
        """
        Returns a context manager for the files and operations clients for the Google Drive API.

        Yields:
            Tuple[GoogleAPIResource, GoogleAPIResource]: A tuple of the files
                and operations clients.
        """
        from googleapiclient.discovery import build

        creds = self._get_credentials()
        service = build("drive", "v3", credentials=creds)
        with (
            service.operations() as operations_client,
            service.files() as files_client,
        ):
            yield files_client, operations_client

    @requires_dependencies(["httpx"])
    def _raw_download_google_drive_file(self, url: str, download_path: Path) -> Path:
        """
        Streams file content directly to disk using authenticated HTTP request.
        Must use httpx to stream the file to disk as currently there's no google SDK
        functionality to download a file like for get media or export operations.

        Writes the file to the correct path in the download directory while downloading.
        Avoids buffering large files in memory.

        Args:
            url (str): The URL of the file to download.
            download_path (Path): The path to save the downloaded file.

        Returns:
            Path: The path to the downloaded file.

        Raises:
            SourceConnectionError: If the response status code is not 200.
        """
        import httpx
        from google.auth.transport.requests import Request

        creds = self._get_credentials()

        # Refresh so that ``creds.token`` holds a usable bearer token.
        creds.refresh(Request())

        headers = {
            "Authorization": f"Bearer {creds.token}",
        }

        with (
            httpx.Client(timeout=None, follow_redirects=True) as client,
            client.stream("GET", url, headers=headers) as response,
        ):
            if response.status_code != 200:
                raise SourceConnectionError(
                    f"Failed to stream download from {url}: {response.status_code}"
                )
            with open(download_path, "wb") as f:
                for chunk in response.iter_bytes():
                    f.write(chunk)
            return download_path

    @requires_dependencies(["google"], extras="google-drive")
    def _get_credentials(self):
        """
        Retrieves the credentials for Google Drive API access.

        Returns:
            Credentials: The credentials for Google Drive API access.
        """
        from google.oauth2 import service_account

        access_config = self.connection_config.access_config.get_secret_value()
        key_data = access_config.get_service_account_key()
        creds = service_account.Credentials.from_service_account_info(
            key_data,
            scopes=["https://www.googleapis.com/auth/drive.readonly"],
        )
        return creds

    def _download_file(self, file_data: FileData) -> Path:
        """Downloads a file from Google Drive using either direct download or export based
        on the source file's MIME type.

        This method determines the appropriate download method based on the file's MIME type:
        - For Google Workspace files (Docs, Sheets, Slides), uses export functionality
        - For other files, uses direct download

        Args:
            file_data (FileData): The metadata of the file being downloaded.

        Returns:
            Path: The path to the downloaded file.

        Raises:
            SourceConnectionError: If the download fails.
        """
        mime_type = file_data.additional_metadata.get("mimeType", "")
        # ``size`` may be absent or null in the metadata; treat both as zero.
        file_size = int(file_data.additional_metadata.get("size", 0) or 0)
        file_id = file_data.identifier

        download_path = self.get_download_path(file_data)
        if not download_path:
            raise SourceConnectionError(f"Failed to get download path for file {file_id}")

        if mime_type in GOOGLE_EXPORT_MIME_MAP:
            # For Google Workspace files, use export functionality
            ext = _get_extension(file_data)
            download_path = download_path.with_suffix(ext)
            download_path.parent.mkdir(parents=True, exist_ok=True)
            export_mime = GOOGLE_EXPORT_MIME_MAP[mime_type]
            self._export_gdrive_native_file(
                file_id=file_id,
                download_path=download_path,
                mime_type=export_mime,
                file_size=file_size,
            )
            file_data.additional_metadata.update(
                {
                    "export_mime_type": export_mime,
                    "export_extension": ext,
                    "download_method": "google_workspace_export",
                }
            )
        else:
            # For other files, use direct download
            download_path.parent.mkdir(parents=True, exist_ok=True)
            self._direct_download_file(file_id=file_id, download_path=download_path)
            file_data.additional_metadata.update(
                {
                    "download_method": "direct_download",
                }
            )

        return download_path

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the file described by ``file_data`` and build the download response.

        The resolved local path is recorded on ``file_data.local_download_path``.
        """
        mime_type = file_data.additional_metadata.get("mimeType", "")

        logger.debug(
            f"Downloading file {file_data.source_identifiers.fullpath} of type {mime_type}"
        )

        download_path = self._download_file(file_data)

        file_data.local_download_path = str(download_path.resolve())

        return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
840
|
+
|
|
841
|
+
|
|
842
|
+
# Source registry entry bundling the Google Drive connection, indexer and downloader
# classes together with their configuration classes.
google_drive_source_entry = SourceRegistryEntry(
    indexer=GoogleDriveIndexer,
    indexer_config=GoogleDriveIndexerConfig,
    downloader=GoogleDriveDownloader,
    downloader_config=GoogleDriveDownloaderConfig,
    connection_config=GoogleDriveConnectionConfig,
)
|