unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,848 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
6
+
7
+ from dateutil import parser
8
+ from pydantic import Field, Secret
9
+ from pydantic.functional_validators import BeforeValidator
10
+
11
+ from unstructured_ingest.data_types.file_data import (
12
+ FileData,
13
+ FileDataSourceMetadata,
14
+ SourceIdentifiers,
15
+ )
16
+ from unstructured_ingest.error import SourceConnectionError, UserAuthError, ValueError
17
+ from unstructured_ingest.interfaces import (
18
+ AccessConfig,
19
+ ConnectionConfig,
20
+ Downloader,
21
+ DownloaderConfig,
22
+ DownloadResponse,
23
+ Indexer,
24
+ IndexerConfig,
25
+ )
26
+ from unstructured_ingest.logger import logger
27
+ from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
28
+ from unstructured_ingest.processes.connectors.utils import conform_string_to_dict
29
+ from unstructured_ingest.utils.dep_check import requires_dependencies
30
+
31
+ if TYPE_CHECKING:
32
+ from googleapiclient.discovery import Resource as GoogleAPIResource
33
+
34
CONNECTOR_TYPE = "google_drive"

# Office Open XML MIME types, named once because both tables below use them.
_OOXML_PREFIX = "application/vnd.openxmlformats-officedocument"
_DOCX_MIME = f"{_OOXML_PREFIX}.wordprocessingml.document"
_XLSX_MIME = f"{_OOXML_PREFIX}.spreadsheetml.sheet"
_PPTX_MIME = f"{_OOXML_PREFIX}.presentationml.presentation"

# Google-native Drive MIME type -> MIME type used when exporting that file.
GOOGLE_EXPORT_MIME_MAP = {
    "application/vnd.google-apps.document": _DOCX_MIME,
    "application/vnd.google-apps.spreadsheet": _XLSX_MIME,
    "application/vnd.google-apps.presentation": _PPTX_MIME,
}

# Export MIME type -> file extension (leading dot included).
EXPORT_EXTENSION_MAP = {
    _DOCX_MIME: ".docx",
    _XLSX_MIME: ".xlsx",
    _PPTX_MIME: ".pptx",
    "application/pdf": ".pdf",
    "text/html": ".html",
}
52
+
53
+ # The real LRO export size threshold is 10MB, but the exported file can be slightly larger
54
+ # than the original Google Workspace file, so the threshold is set to 9MB.
55
+ LRO_EXPORT_SIZE_THRESHOLD = 9 * 1024 * 1024 # 9MB
56
+
57
+
58
class GoogleDriveAccessConfig(AccessConfig):
    """
    Service-account credentials for Google Drive.

    The key can be supplied inline (``service_account_key``), as a path to a
    JSON file (``service_account_key_path``), or both as long as they agree.
    """

    service_account_key: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
        default=None, description="Credentials values to use for authentication"
    )
    service_account_key_path: Optional[Path] = Field(
        default=None,
        description="File path to credentials values to use for authentication",
    )

    def model_post_init(self, __context: Any) -> None:
        """Reject a config that provides neither credential source."""
        has_inline_key = self.service_account_key is not None
        has_key_path = self.service_account_key_path is not None
        if not (has_inline_key or has_key_path):
            raise ValueError(
                "either service_account_key or service_account_key_path must be provided"
            )

    def get_service_account_key(self) -> dict:
        """
        Return the service-account key to authenticate with.

        The JSON file at ``service_account_key_path`` is loaded when a path is
        set. If both the file content and the inline key are non-empty they
        must be equal; otherwise whichever one is available is returned.

        Raises:
            ValueError: (the name imported by this module) when both sources
                are present and differ.
        """
        inline_key = self.service_account_key
        if not self.service_account_key_path:
            return inline_key

        with self.service_account_key_path.open() as key_file:
            file_key = json.load(key_file)

        # An empty/falsy file payload falls back to the inline key.
        if not file_key:
            return inline_key
        if inline_key and file_key != inline_key:
            raise ValueError(
                "service_account_key and service_account_key_path "
                "both provided and have different values"
            )
        return file_key
89
+
90
+
91
class GoogleDriveConnectionConfig(ConnectionConfig):
    """Connection settings for the Google Drive source: target id plus credentials."""

    drive_id: str = Field(description="Google Drive File or Folder ID.")
    access_config: Secret[GoogleDriveAccessConfig]

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    @contextmanager
    def get_client(self) -> Generator["GoogleAPIResource", None, None]:
        """
        Yield the ``files()`` resource of a Drive v3 service.

        The service is built from service-account credentials obtained via
        ``access_config.get_service_account_key()``.

        Because the ``yield`` sits inside the ``try``, an ``HttpError`` raised
        in the caller's ``with`` body is translated here as well.

        Raises:
            ValueError: (the name imported by this module) carrying the
                ``reason`` of an ``HttpError``.
            UserAuthError: when google-auth raises ``DefaultCredentialsError``.
        """
        from google.auth import exceptions
        from google.oauth2 import service_account
        from googleapiclient.discovery import build
        from googleapiclient.errors import HttpError

        access_config = self.access_config.get_secret_value()
        key_data = access_config.get_service_account_key()

        try:
            creds = service_account.Credentials.from_service_account_info(key_data)
            service = build("drive", "v3", credentials=creds)
            with service.files() as client:
                yield client
        except HttpError as exc:
            # Chain explicitly so the original HTTP error stays attached as __cause__.
            raise ValueError(f"{exc.reason}") from exc
        except exceptions.DefaultCredentialsError as exc:
            raise UserAuthError("The provided API key is invalid.") from exc
115
+
116
+
117
class GoogleDriveIndexerConfig(IndexerConfig):
    """Indexing options: optional extension filter and whether to recurse."""

    extensions: Optional[list[str]] = None
    recursive: bool = False

    def model_post_init(self, __context: Any) -> None:
        """Normalise ``extensions`` by removing leading dots (".pdf" -> "pdf")."""
        if self.extensions is None:
            return
        self.extensions = [ext.lstrip(".") for ext in self.extensions]
124
+
125
+
126
+ @dataclass
127
+ class GoogleDriveIndexer(Indexer):
128
+ connection_config: GoogleDriveConnectionConfig
129
+ index_config: GoogleDriveIndexerConfig
130
+ fields: list[str] = field(
131
+ default_factory=lambda: [
132
+ "id",
133
+ "name",
134
+ "mimeType",
135
+ "fileExtension",
136
+ "md5Checksum",
137
+ "sha1Checksum",
138
+ "sha256Checksum",
139
+ "headRevisionId",
140
+ "permissions",
141
+ "createdTime",
142
+ "modifiedTime",
143
+ "version",
144
+ "originalFilename",
145
+ "capabilities",
146
+ "permissionIds",
147
+ "size",
148
+ ]
149
+ )
150
+
151
@staticmethod
def verify_drive_api_enabled(client) -> None:
    """
    Make a lightweight API call to verify that the Drive API is enabled.

    Args:
        client: object exposing ``list(...).execute()``; callers in this file
            pass the Drive ``files()`` resource.

    Raises:
        SourceConnectionError: when the call raises ``HttpError``. The message
            says the API is not enabled if the error body mentions that,
            otherwise it reports an unknown failure. The ``HttpError`` is
            attached as the cause.
    """
    from googleapiclient.errors import HttpError

    try:
        # A very minimal call: list 1 file from the drive.
        client.list(
            supportsAllDrives=True,
            includeItemsFromAllDrives=True,
            spaces="drive",
            pageSize=1,
            fields="files(id)",
        ).execute()
    except HttpError as e:
        error_content = e.content.decode() if hasattr(e, "content") else ""
        lower_error = error_content.lower()
        api_disabled = "drive api" in lower_error and (
            "not enabled" in lower_error or "not been used" in lower_error
        )
        if api_disabled:
            # Adjacent literals instead of a backslash continuation inside the
            # string, which would embed the next line's indentation in the message.
            raise SourceConnectionError(
                "Google Drive API is not enabled for your project. "
                "Please enable it in the Google Cloud Console."
            ) from e
        raise SourceConnectionError(
            "Google drive API unreachable for an unknown reason!"
        ) from e
180
+
181
+ @staticmethod
182
+ def count_files_recursively(
183
+ files_client: "GoogleAPIResource", folder_id: str, extensions: list[str] = None
184
+ ) -> int:
185
+ """
186
+ Count non-folder files recursively under the given folder.
187
+ If `extensions` is provided, only count files
188
+ whose `fileExtension` matches one of the values.
189
+ """
190
+ count = 0
191
+ stack = [folder_id]
192
+ # Pre-compute lower-case extension set for O(1) lookup
193
+ valid_exts = set(e.lower() for e in extensions) if extensions else None
194
+
195
+ while stack:
196
+ current_folder = stack.pop()
197
+ # Always list all items under the current folder.
198
+ query = f"'{current_folder}' in parents"
199
+ page_token = None
200
+ while True:
201
+ response = files_client.list(
202
+ supportsAllDrives=True,
203
+ includeItemsFromAllDrives=True,
204
+ spaces="drive",
205
+ q=query,
206
+ fields="nextPageToken, files(id, mimeType, fileExtension)",
207
+ pageToken=page_token,
208
+ pageSize=1000,
209
+ ).execute()
210
+ for item in response.get("files", []):
211
+ if item.get("mimeType") == "application/vnd.google-apps.folder":
212
+ # Always traverse sub-folders regardless of extension filter.
213
+ stack.append(item["id"])
214
+ else:
215
+ if extensions:
216
+ # Use a case-insensitive comparison for the file extension.
217
+ file_ext = (item.get("fileExtension") or "").lower()
218
+ if file_ext in valid_exts:
219
+ count += 1
220
+ else:
221
+ count += 1
222
+ page_token = response.get("nextPageToken")
223
+ if not page_token:
224
+ break
225
+ return count
226
+
227
def precheck(self) -> None:
    """
    Verify connectivity and that the configured drive_id is valid and accessible.

    Steps, in order: confirm the Drive API answers, fetch metadata for
    ``connection_config.drive_id``, and - when the target is a folder - look
    for content (recursively counted when ``index_config.recursive`` is set,
    otherwise a single-item listing of direct children). An empty folder is
    only logged as a warning.

    Raises:
        SourceConnectionError: when any step raises; the triggering exception
            is attached as the cause.
    """
    # Built from adjacent literals: a backslash continuation inside a string
    # would embed the next source line's indentation in the log output.
    empty_folder_hint = (
        "Please verify that the folder contains files and "
        "that the service account has proper permissions."
    )
    try:
        with self.connection_config.get_client() as client:
            drive_id = self.connection_config.drive_id

            # First, verify that the Drive API is enabled.
            self.verify_drive_api_enabled(client)

            # Try to retrieve metadata for the drive id.
            # This will catch errors such as an invalid drive id or insufficient permissions.
            root_info = self.get_root_info(files_client=client, object_id=drive_id)
            logger.info(
                f"Successfully retrieved drive root info: "
                f"{root_info.get('name', 'Unnamed')} (ID: {root_info.get('id')})"
            )

            if not self.is_dir(root_info):
                # If the target is a file, precheck passes.
                logger.info("Drive ID corresponds to a file. Precheck passed.")
            elif self.index_config.recursive:
                file_count = self.count_files_recursively(
                    client,
                    drive_id,
                    extensions=self.index_config.extensions,
                )
                if file_count == 0:
                    logger.warning(
                        "Empty folder: no files found recursively in the folder. "
                        + empty_folder_hint
                    )
                else:
                    logger.info(f"Found {file_count} files recursively in the folder.")
            else:
                # Non-recursive: check for at least one immediate child.
                # The query has no mimeType filter, so a sub-folder also counts.
                response = client.list(
                    supportsAllDrives=True,
                    includeItemsFromAllDrives=True,
                    spaces="drive",
                    fields="files(id)",
                    pageSize=1,
                    q=f"'{drive_id}' in parents",
                ).execute()
                if not response.get("files"):
                    logger.warning(
                        "Empty folder: no files found at the folder's root level. "
                        + empty_folder_hint
                    )
                else:
                    logger.info("Found files at the folder's root level.")

    except Exception as e:
        logger.error(
            "Failed to validate Google Drive connection during precheck",
            exc_info=True,
        )
        raise SourceConnectionError(f"Precheck failed: {e}") from e
301
+
302
+ @staticmethod
303
+ def is_dir(record: dict) -> bool:
304
+ return record.get("mimeType") == "application/vnd.google-apps.folder"
305
+
306
@staticmethod
def map_file_data(root_info: dict) -> FileData:
    """
    Convert one Drive file record into a ``FileData``.

    Note: ``root_info`` is mutated. The keys consumed here (``name``,
    ``webContentLink``, ``version``, ``permissions``, ``createdTime``,
    ``modifiedTime``, ``parent_path``, ``parent_root_path``) are popped, and
    the remaining dict is passed on as ``additional_metadata``.

    Args:
        root_info: Drive record; must contain ``id`` and ``name``.

    Returns:
        ``FileData`` whose identifier is the file id. ``fullpath``/``rel_path``
        are derived from ``parent_path`` and ``parent_root_path`` when both
        are non-empty strings; otherwise the file name alone is used.
    """
    file_id = root_info["id"]
    filename = root_info.pop("name")
    url = root_info.pop("webContentLink", None)
    version = root_info.pop("version", None)
    permissions = root_info.pop("permissions", None)
    date_created_str = root_info.pop("createdTime", None)
    date_modified_str = root_info.pop("modifiedTime", None)
    parent_path = root_info.pop("parent_path", None)
    parent_root_path = root_info.pop("parent_root_path", None)

    # A missing timestamp stays None instead of raising AttributeError on
    # ``None.timestamp()``.
    # NOTE(review): assumes FileDataSourceMetadata accepts None for the date
    # fields - confirm against its definition.
    date_created = str(parser.parse(date_created_str).timestamp()) if date_created_str else None
    date_modified = (
        str(parser.parse(date_modified_str).timestamp()) if date_modified_str else None
    )

    has_paths = (
        parent_path
        and isinstance(parent_path, str)
        and parent_root_path
        and isinstance(parent_root_path, str)
    )
    if has_paths:
        fullpath = f"{parent_path}/{filename}"
        rel_path = Path(fullpath).relative_to(parent_root_path).as_posix()
        source_identifiers = SourceIdentifiers(
            filename=filename, fullpath=fullpath, rel_path=rel_path
        )
    else:
        source_identifiers = SourceIdentifiers(fullpath=filename, filename=filename)

    return FileData(
        connector_type=CONNECTOR_TYPE,
        identifier=file_id,
        source_identifiers=source_identifiers,
        metadata=FileDataSourceMetadata(
            url=url,
            version=version,
            date_created=date_created,
            date_modified=date_modified,
            permissions_data=permissions,
            record_locator={
                "file_id": file_id,
            },
        ),
        additional_metadata=root_info,
        display_name=source_identifiers.fullpath,
    )
349
+
350
def get_paginated_results(
    self,
    files_client,
    object_id: str,
    extensions: Optional[list[str]] = None,
    recursive: bool = False,
    previous_path: Optional[str] = None,
) -> list[dict]:
    """
    List the non-folder children of ``object_id``, following every result page.

    Args:
        files_client: object exposing ``list(...).execute()``.
        object_id: id of the folder whose children are listed.
        extensions: when given, the query only matches these file extensions
            (folders are always matched so they can still be traversed).
        recursive: when True, sub-folders are listed too and their files are
            appended after the files of the page they were found on.
        previous_path: path of the folder being listed; stored on each file
            as ``parent_path`` and, for everything returned by this call, as
            ``parent_root_path``.

    Returns:
        File records (dicts), each extended with ``parent_path`` and
        ``parent_root_path``.
    """
    fields_input = "nextPageToken, files({})".format(",".join(self.fields))
    q = f"'{object_id}' in parents"
    # Filter by extension but still include any directories
    if extensions:
        ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions])
        q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')"
    logger.debug(f"query used when indexing: {q}")
    logger.debug("response fields limited to: {}".format(", ".join(self.fields)))

    page_token = None
    files_response = []
    while True:
        response: dict = files_client.list(
            supportsAllDrives=True,
            includeItemsFromAllDrives=True,
            spaces="drive",
            fields=fields_input,
            corpora="user",
            pageToken=page_token,
            q=q,
        ).execute()
        if files := response.get("files", []):
            plain_files = [f for f in files if not self.is_dir(record=f)]
            for record in plain_files:
                record["parent_path"] = previous_path
            files_response.extend(plain_files)
            if recursive:
                for folder in (f for f in files if self.is_dir(record=f)):
                    # Without a known parent path, start from the folder name
                    # rather than formatting None into the literal "None/<name>".
                    child_path = (
                        folder["name"]
                        if previous_path is None
                        else f"{previous_path}/{folder['name']}"
                    )
                    files_response.extend(
                        self.get_paginated_results(
                            files_client=files_client,
                            object_id=folder["id"],
                            extensions=extensions,
                            recursive=recursive,
                            previous_path=child_path,
                        )
                    )
        page_token = response.get("nextPageToken")
        if page_token is None:
            break

    # Runs after the nested calls, so the outermost call's path ends up on
    # every record it returns.
    for record in files_response:
        record["parent_root_path"] = previous_path
    return files_response
404
+
405
def get_root_info(self, files_client, object_id: str) -> dict:
    """
    Fetch the metadata record of ``object_id`` itself.

    Only the fields listed in ``self.fields`` are requested.
    """
    requested_fields = ",".join(self.fields)
    request = files_client.get(
        supportsAllDrives=True,
        fileId=object_id,
        fields=requested_fields,
    )
    return request.execute()
409
+
410
def get_files(
    self,
    files_client,
    object_id: str,
    recursive: bool = False,
    extensions: Optional[list[str]] = None,
) -> list[FileData]:
    """
    Build the ``FileData`` entries for ``object_id``.

    If ``object_id`` is a single file, one entry is returned for it. If it is a
    directory, its contents are listed (optionally recursively and filtered by
    extension) and one entry is returned per file found.

    Args:
        files_client: Drive ``files`` resource used for the API calls.
        object_id: ID of the file or directory to index.
        recursive: Whether nested directories are traversed.
        extensions: Optional list of file extensions to restrict the listing to.

    Returns:
        The mapped entries. Each entry's ``record_locator`` carries ``drive_id``
        set to ``object_id``.
    """
    root_info = self.get_root_info(files_client=files_client, object_id=object_id)
    if not self.is_dir(root_info):
        root_info["permissions"] = self.extract_permissions(root_info.get("permissions"))
        data = [self.map_file_data(root_info)]
    else:
        file_contents = self.get_paginated_results(
            files_client=files_client,
            object_id=object_id,
            extensions=extensions,
            recursive=recursive,
            previous_path=root_info["name"],
        )
        data = []
        for f in file_contents:
            f["permissions"] = self.extract_permissions(f.get("permissions"))
            data.append(self.map_file_data(root_info=f))
    for d in data:
        # Must be an assignment: the previous `target: value` form was only an
        # annotation statement and never stored the drive id.
        d.metadata.record_locator["drive_id"] = object_id
    return data
436
+
437
def extract_permissions(self, permissions: Optional[list[dict]]) -> list[dict]:
    """
    Normalize Drive permission records into read/update/delete principal lists.

    Only ``user`` and ``group`` permissions are considered; ``anyone`` and
    ``domain`` entries are ignored.

    Returns:
        ``[{}]`` when no permissions are given, otherwise a list of three
        single-key dicts (``read``, ``update``, ``delete``), each mapping to
        ``{"users": [...], "groups": [...]}`` with sorted principal ids.
    """
    if not permissions:
        logger.debug("no permissions found")
        return [{}]

    # https://developers.google.com/workspace/drive/api/guides/ref-roles
    operations_by_role = {
        "owner": ["read", "update", "delete"],
        "organizer": ["read", "update", "delete"],
        "fileOrganizer": ["read", "update"],
        "writer": ["read", "update"],
        "commenter": ["read"],
        "reader": ["read"],
    }

    grants = {
        operation: {"users": set(), "groups": set()}
        for operation in ("read", "update", "delete")
    }

    # https://developers.google.com/workspace/drive/api/reference/rest/v3/permissions
    for entry in permissions:
        principal_type = entry["type"]
        if principal_type not in ("user", "group"):
            # ignore permissions for "anyone" and "domain"
            continue
        bucket = principal_type + "s"
        for operation in operations_by_role[entry["role"]]:
            grants[operation][bucket].add(entry["id"])

    # turn sets into sorted lists for consistency and json serialization
    normalized_permissions = {
        operation: {bucket: sorted(members) for bucket, members in buckets.items()}
        for operation, buckets in grants.items()
    }

    logger.debug(f"normalized permissions generated: {normalized_permissions}")
    return [{operation: buckets} for operation, buckets in normalized_permissions.items()]
473
+
474
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
    """
    Yield one ``FileData`` per file found under the configured ``drive_id``.

    The listing honours the ``recursive`` and ``extensions`` settings of the
    index config; the Drive client stays open while results are yielded.
    """
    with self.connection_config.get_client() as client:
        yield from self.get_files(
            files_client=client,
            object_id=self.connection_config.drive_id,
            recursive=self.index_config.recursive,
            extensions=self.index_config.extensions,
        )
483
+
484
+
485
class GoogleDriveDownloaderConfig(DownloaderConfig):
    # Limits applied while polling a long-running export operation (LRO):
    # maximum number of polling attempts ...
    lro_max_tries: int = 10
    # ... and maximum total polling time in seconds (10 minutes)
    lro_max_time: int = 600
488
+
489
+
490
def _get_extension(file_data: FileData) -> str:
    """
    Returns the export file extension for the file's source MIME type.

    The source MIME type is read from the ``mimeType`` entry of
    ``file_data.additional_metadata`` (the same key ``GOOGLE_EXPORT_MIME_MAP`` is
    looked up with when deciding whether a file needs exporting). It is mapped to
    its export MIME type and then to an extension.

    Returns:
        The extension for the export format, or an empty string when the source
        MIME type has no export mapping or the export type has no known extension.
    """
    # NOTE: this must be the *source* type. The "export_mime_type" entry is only
    # written after the export has run and holds a value of the map, not a key.
    source_mime_type = file_data.additional_metadata.get("mimeType", "")
    export_mime_type = GOOGLE_EXPORT_MIME_MAP.get(source_mime_type, "")
    if not export_mime_type:
        return ""
    return EXPORT_EXTENSION_MAP.get(export_mime_type, "")
499
+
500
+
501
@dataclass
class GoogleDriveDownloader(Downloader):
    """
    Downloads files from Google Drive using googleapis client. For native files, it uses the export
    functionality for files <10MB and LRO (Long Running Operation) for files >10MB.
    """

    connection_config: GoogleDriveConnectionConfig
    download_config: GoogleDriveDownloaderConfig = field(
        default_factory=lambda: GoogleDriveDownloaderConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def _direct_download_file(self, file_id, download_path: Path):
        """Downloads a file from Google Drive using the Drive API's media download functionality.
        The method uses Google Drive API's media download functionality to stream the file
        content directly to disk.

        Args:
            file_id (str): The ID of the file to download from Google Drive.
            download_path (Path): The local path where the file should be saved.

        Raises:
            SourceConnectionError: If the download operation fails.
        """
        from googleapiclient.errors import HttpError
        from googleapiclient.http import MediaIoBaseDownload

        try:
            with self.connection_config.get_client() as client:
                # pylint: disable=maybe-no-member
                request = client.get_media(fileId=file_id)

                with open(download_path, "wb") as file:
                    downloader = MediaIoBaseDownload(file, request)
                    done = False
                    while done is False:
                        status, done = downloader.next_chunk()
                        logger.debug(f"Download progress:{int(status.progress() * 100)}.")

        except (HttpError, ValueError) as error:
            logger.exception(f"Error downloading file {file_id} to {download_path}: {error}")
            raise SourceConnectionError("Failed to download file") from error

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def _export_gdrive_file_with_lro(self, file_id: str, download_path: Path, mime_type: str):
        """Exports a Google Drive file using Long-Running Operation (LRO) for large files
        (>10MB of the exported file size).

        This method is used when the standard export method fails due to file size limitations.
        It uses the Drive API's LRO functionality to handle large file exports.

        Args:
            file_id (str): The ID of the Google Drive file to export.
            download_path (Path): The local path where the exported file should be saved.
            mime_type (str): The target MIME type for the exported file.
        Raises:
            SourceConnectionError: If the export operation fails, reports an error, or does
                not finish within the configured number of attempts / time budget.
        """

        import tenacity
        from googleapiclient.errors import HttpError

        max_time = self.download_config.lro_max_time
        max_tries = self.download_config.lro_max_tries

        class OperationNotFinished(Exception):
            """
            Exception raised when the operation is not finished.
            """

            pass

        def is_fatal_code(e: Exception) -> bool:
            """
            Returns True if the error is fatal and should not be retried.
            403 and 429 can mean "Too many requests" or "User rate limit exceeded"
            which should be retried.
            """
            return (
                isinstance(e, HttpError)
                and 400 <= e.resp.status < 500
                and e.resp.status not in [403, 429]
            )

        def _raise_if_failed(operation: dict) -> None:
            """Raise SourceConnectionError when the operation payload carries an error."""
            if "error" in operation:
                error = operation["error"]
                raise SourceConnectionError(
                    f"Export operation failed: {error.get('message', error)}"
                )

        @tenacity.retry(
            wait=tenacity.wait_exponential(),
            retry=tenacity.retry_if_exception(
                lambda e: (
                    isinstance(e, (HttpError, OperationNotFinished)) and not is_fatal_code(e)
                )
            ),
            stop=(tenacity.stop_after_attempt(max_tries) | tenacity.stop_after_delay(max_time)),
        )
        def _poll_operation(operation: dict, operations_client: "GoogleAPIResource") -> dict:
            """
            Helper function to poll the operation until it's complete.
            Uses backoff exponential retry logic.

            Each `operations.get` call uses the Google API requests limit. Details:
            https://developers.google.com/workspace/drive/api/guides/limits

            The limits as of May 2025 are:
            - 12.000 calls per 60 seconds

            In case of request limiting, the API will return 403 `User rate limit exceeded` error
            or 429 `Too many requests` error.
            """
            # Check for an error before looking at "done": a finished operation that
            # failed has no "response", so returning it would only defer the failure
            # to a KeyError in the caller.
            _raise_if_failed(operation)
            if operation.get("done", False):
                return operation
            # Refresh the operation status:
            # FYI: In some cases the `operations.get` call errors with 403 "User does not have
            # permission" error even if the same user create the operation with `download` method.
            updated_operation = operations_client.get(name=operation["name"]).execute()
            _raise_if_failed(updated_operation)
            if not updated_operation.get("done", False):
                raise OperationNotFinished()
            return updated_operation

        try:
            with self._get_files_and_operations_client() as (files_client, operations_client):
                # Start the LRO
                operation = files_client.download(fileId=file_id, mimeType=mime_type).execute()

                # In case the operation is not finished, poll it until it's complete
                updated_operation = _poll_operation(operation, operations_client)

                # Get the download URI from the completed operation
                download_uri = updated_operation["response"]["downloadUri"]

                # Download the file using the URI
                self._raw_download_google_drive_file(download_uri, download_path)

        except HttpError as error:
            raise SourceConnectionError(
                f"Failed to export file using Google Drive LRO: {error}"
            ) from error
        except tenacity.RetryError as error:
            # Raised by tenacity once the attempt/time budget is exhausted; surface it
            # as the connector's own error type like every other export failure.
            raise SourceConnectionError(
                f"Export operation for file {file_id} did not finish within "
                f"{max_tries} attempts or {max_time} seconds"
            ) from error

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def _export_gdrive_native_file(
        self, file_id: str, download_path: Path, mime_type: str, file_size: int
    ):
        """Exports a Google Drive native file (Docs, Sheets, Slides) to a specified format.

        This method uses the Google Drive API's export functionality to convert Google Workspace
        files to other formats (e.g., Google Docs to PDF, Google Sheets to Excel).
        For files larger than 10MB, it falls back to using Long-Running Operation (LRO).

        Args:
            file_id (str): The ID of the Google Drive file to export.
            download_path (Path): The local path where the exported file should be saved.
            mime_type (str): The target MIME type for the exported file (e.g., 'application/pdf').
            file_size (int): The size of the file to export - used to determine if the
                file is large enough to use LRO instead of direct export endpoint.
        Returns:
            None. The exported content is written to ``download_path``.

        Raises:
            SourceConnectionError: If the export operation fails.
        """
        from googleapiclient.errors import HttpError
        from googleapiclient.http import MediaIoBaseDownload

        if file_size > LRO_EXPORT_SIZE_THRESHOLD:
            self._export_gdrive_file_with_lro(file_id, download_path, mime_type)
            return

        with self.connection_config.get_client() as client:
            try:
                # pylint: disable=maybe-no-member
                request = client.export_media(fileId=file_id, mimeType=mime_type)
                with open(download_path, "wb") as file:
                    downloader = MediaIoBaseDownload(file, request)
                    done = False
                    while done is False:
                        status, done = downloader.next_chunk()
                        logger.debug(f"Download progress: {int(status.progress() * 100)}.")
            except HttpError as error:
                if error.resp.status == 403 and "too large" in error.reason.lower():
                    # Even though we have the LRO threshold, for some smaller files the
                    # export size might exceed 10MB and we get a 403 error.
                    # In that case, we use LRO as a fallback.
                    self._export_gdrive_file_with_lro(file_id, download_path, mime_type)
                else:
                    raise SourceConnectionError(f"Failed to export file: {error}") from error

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    @contextmanager
    def _get_files_and_operations_client(
        self,
    ) -> Generator[tuple["GoogleAPIResource", "GoogleAPIResource"], None, None]:
        """
        Returns a context manager for the files and operations clients for the Google Drive API.

        Yields:
            Tuple[GoogleAPIResource, GoogleAPIResource]: A tuple of the files
            and operations clients.
        """
        from googleapiclient.discovery import build

        creds = self._get_credentials()
        service = build("drive", "v3", credentials=creds)
        with (
            service.operations() as operations_client,
            service.files() as files_client,
        ):
            yield files_client, operations_client

    @requires_dependencies(["httpx"])
    def _raw_download_google_drive_file(self, url: str, download_path: Path) -> Path:
        """
        Streams file content directly to disk using authenticated HTTP request.
        Must use httpx to stream the file to disk as currently there's no google SDK
        functionality to download a file like for get media or export operations.

        Writes the file to the correct path in the download directory while downloading.
        Avoids buffering large files in memory.

        Args:
            url (str): The URL of the file to download.
            download_path (Path): The path to save the downloaded file.

        Returns:
            Path: The path to the downloaded file.

        Raises:
            SourceConnectionError: If the response status code is not 200.
        """
        import httpx
        from google.auth.transport.requests import Request

        creds = self._get_credentials()

        # Refresh so that creds.token is populated before building the header
        creds.refresh(Request())

        headers = {
            "Authorization": f"Bearer {creds.token}",
        }

        with (
            httpx.Client(timeout=None, follow_redirects=True) as client,
            client.stream("GET", url, headers=headers) as response,
        ):
            if response.status_code != 200:
                raise SourceConnectionError(
                    f"Failed to stream download from {url}: {response.status_code}"
                )
            with open(download_path, "wb") as f:
                for chunk in response.iter_bytes():
                    f.write(chunk)
            return download_path

    @requires_dependencies(["google"], extras="google-drive")
    def _get_credentials(self):
        """
        Retrieves the credentials for Google Drive API access.

        Returns:
            Credentials: The credentials for Google Drive API access.
        """
        from google.oauth2 import service_account

        access_config = self.connection_config.access_config.get_secret_value()
        key_data = access_config.get_service_account_key()
        creds = service_account.Credentials.from_service_account_info(
            key_data,
            scopes=["https://www.googleapis.com/auth/drive.readonly"],
        )
        return creds

    def _download_file(self, file_data: FileData) -> Path:
        """Downloads a file from Google Drive using either direct download or export based
        on the source file's MIME type.

        This method determines the appropriate download method based on the file's MIME type:
        - For Google Workspace files (Docs, Sheets, Slides), uses export functionality
        - For other files, uses direct download

        The chosen method (and, for exports, the export MIME type and extension) is recorded
        in ``file_data.additional_metadata``.

        Args:
            file_data (FileData): The metadata of the file being downloaded.

        Returns:
            Path: The path to the downloaded file.

        Raises:
            SourceConnectionError: If the download fails.
        """
        mime_type = file_data.additional_metadata.get("mimeType", "")
        file_size = int(file_data.additional_metadata.get("size", 0))
        file_id = file_data.identifier

        download_path = self.get_download_path(file_data)
        if not download_path:
            raise SourceConnectionError(f"Failed to get download path for file {file_id}")

        if mime_type in GOOGLE_EXPORT_MIME_MAP:
            # For Google Workspace files, use export functionality
            ext = _get_extension(file_data)
            download_path = download_path.with_suffix(ext)
            download_path.parent.mkdir(parents=True, exist_ok=True)
            export_mime = GOOGLE_EXPORT_MIME_MAP[mime_type]
            self._export_gdrive_native_file(
                file_id=file_id,
                download_path=download_path,
                mime_type=export_mime,
                file_size=file_size,
            )
            file_data.additional_metadata.update(
                {
                    "export_mime_type": export_mime,
                    "export_extension": ext,
                    "download_method": "google_workspace_export",
                }
            )
        else:
            # For other files, use direct download
            download_path.parent.mkdir(parents=True, exist_ok=True)
            self._direct_download_file(file_id=file_id, download_path=download_path)
            file_data.additional_metadata.update(
                {
                    "download_method": "direct_download",
                }
            )

        return download_path

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """
        Download the file described by ``file_data`` and build the download response.

        The resolved local path is stored on ``file_data.local_download_path``.
        """
        mime_type = file_data.additional_metadata.get("mimeType", "")

        logger.debug(
            f"Downloading file {file_data.source_identifiers.fullpath} of type {mime_type}"
        )

        download_path = self._download_file(file_data)

        file_data.local_download_path = str(download_path.resolve())

        return self.generate_download_response(file_data=file_data, download_path=download_path)
840
+
841
+
842
# Source registry entry bundling the Google Drive connection config with its
# indexer and downloader (and their respective config classes).
google_drive_source_entry = SourceRegistryEntry(
    connection_config=GoogleDriveConnectionConfig,
    indexer=GoogleDriveIndexer,
    indexer_config=GoogleDriveIndexerConfig,
    downloader=GoogleDriveDownloader,
    downloader_config=GoogleDriveDownloaderConfig,
)