unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,485 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from time import time
8
+ from typing import TYPE_CHECKING, Any, AsyncIterator, Optional
9
+
10
+ from dateutil import parser
11
+ from pydantic import Field, Secret
12
+
13
+ from unstructured_ingest.data_types.file_data import (
14
+ FileData,
15
+ FileDataSourceMetadata,
16
+ SourceIdentifiers,
17
+ )
18
+ from unstructured_ingest.error import (
19
+ DestinationConnectionError,
20
+ SourceConnectionError,
21
+ SourceConnectionNetworkError,
22
+ UserAuthError,
23
+ ValueError,
24
+ )
25
+ from unstructured_ingest.interfaces import (
26
+ AccessConfig,
27
+ ConnectionConfig,
28
+ Downloader,
29
+ DownloaderConfig,
30
+ DownloadResponse,
31
+ Indexer,
32
+ IndexerConfig,
33
+ Uploader,
34
+ UploaderConfig,
35
+ )
36
+ from unstructured_ingest.logger import logger
37
+ from unstructured_ingest.processes.connector_registry import (
38
+ DestinationRegistryEntry,
39
+ SourceRegistryEntry,
40
+ )
41
+ from unstructured_ingest.processes.utils.blob_storage import (
42
+ BlobStoreUploadStager,
43
+ BlobStoreUploadStagerConfig,
44
+ )
45
+ from unstructured_ingest.utils.dep_check import requires_dependencies
46
+
47
+ if TYPE_CHECKING:
48
+ from office365.graph_client import GraphClient
49
+ from office365.onedrive.driveitems.driveItem import DriveItem
50
+ from office365.onedrive.drives.drive import Drive
51
+
52
# Connector type label stored on the indexer/downloader/uploader in this module.
CONNECTOR_TYPE = "onedrive"
# Byte threshold (512 * 10^6) that file sizes are compared against when
# choosing how to transfer a file.
MAX_BYTES_SIZE = 512 * 1000 * 1000
54
+
55
+
56
class OnedriveAccessConfig(AccessConfig):
    """Secret credentials used when requesting a Microsoft Graph token."""

    # Client secret of the registered Microsoft application.
    client_cred: str = Field(description="Microsoft App client secret")
    # Optional service-account password; token acquisition switches to the
    # OAuth2 password grant when this is set.
    password: Optional[str] = Field(default=None, description="Service account password")
59
+
60
+
61
class OnedriveConnectionConfig(ConnectionConfig):
    """Connection settings and token acquisition for a user's OneDrive via Microsoft Graph."""

    client_id: str = Field(description="Microsoft app client ID")
    user_pname: str = Field(
        description="User principal name or service account, usually your Azure AD email."
    )
    tenant: str = Field(
        repr=False, description="ID or domain name associated with your Azure AD instance"
    )
    authority_url: Optional[str] = Field(
        repr=False,
        default="https://login.microsoftonline.com",
        examples=["https://login.microsoftonline.com"],
        description="Authentication token provider for Microsoft apps",
    )
    access_config: Secret[OnedriveAccessConfig]

    def get_drive(self) -> "Drive":
        """Return the drive of ``user_pname`` from a newly built Graph client."""
        return self.get_client().users[self.user_pname].drive

    @requires_dependencies(["msal", "requests"], extras="onedrive")
    def get_token(self):
        """Acquire a Microsoft Graph token and return the raw token response.

        When a password is configured, the OAuth2 password grant is used via a
        direct HTTP POST; otherwise the MSAL client-credentials flow is used.
        Both flows target the configured ``authority_url`` (falling back to the
        public Microsoft login host when it is unset).

        Returns:
            The token response as a dict.

        Raises:
            SourceConnectionError: the password-grant request returned a
                non-200 status.
            UserAuthError: MSAL reported an authentication-related error.
            SourceConnectionNetworkError: MSAL reported any other error.
            requests.exceptions.Timeout: the password-grant request exceeded
                the timeout.
        """
        from msal import ConfidentialClientApplication
        from requests import post

        # Unwrap the secret once instead of on every field access.
        credentials = self.access_config.get_secret_value()
        # Honour the configured authority in both flows; tolerate a trailing slash.
        authority = (self.authority_url or "https://login.microsoftonline.com").rstrip("/")
        graph_scope = "https://graph.microsoft.com/.default"

        if credentials.password:
            response = post(
                f"{authority}/{self.tenant}/oauth2/v2.0/token",
                headers={"Content-Type": "application/x-www-form-urlencoded"},
                data={
                    "grant_type": "password",
                    "username": self.user_pname,
                    "password": credentials.password,
                    "client_id": self.client_id,
                    "client_secret": credentials.client_cred,
                    "scope": graph_scope,
                },
                # Without a timeout an unresponsive endpoint blocks forever.
                timeout=60,
            )
            if response.status_code == 200:
                return response.json()
            raise SourceConnectionError(
                f"Oauth2 authentication failed with {response.status_code}: {response.text}"
            )

        try:
            app = ConfidentialClientApplication(
                authority=f"{authority}/{self.tenant}",
                client_id=self.client_id,
                client_credential=credentials.client_cred,
            )
            token = app.acquire_token_for_client(scopes=[graph_scope])
        # NOTE(review): `ValueError` is imported from unstructured_ingest.error at the
        # top of this file and shadows the builtin -- confirm it still matches the
        # exception raised by msal for bad credentials setup.
        except ValueError:
            logger.error("Couldn't set up credentials.")
            raise

        if "error" in token:
            error_codes = token.get("error_codes", [])
            error_type = token.get("error", "")
            error_description = token.get("error_description", "")

            # 7000215: Invalid client secret provided
            # 7000218: Invalid client id provided
            # 700016: Application not found in directory
            # 90002: Tenant not found
            auth_error_codes = (7000215, 7000218, 700016, 90002)
            auth_error_types = ("invalid_client", "unauthorized_client", "invalid_grant")

            is_auth_failure = (
                any(code in error_codes for code in auth_error_codes)
                or error_type in auth_error_types
            )
            if is_auth_failure:
                raise UserAuthError(f"Authentication failed: {error_type}: {error_description}")
            raise SourceConnectionNetworkError(
                f"Failed to fetch token: {error_type}: {error_description}"
            )
        return token

    @requires_dependencies(["office365"], extras="onedrive")
    def get_client(self) -> "GraphClient":
        """Build a ``GraphClient`` that calls ``get_token`` whenever it needs a token."""
        from office365.graph_client import GraphClient

        # The bound method itself (not its result) is handed to the client.
        return GraphClient(self.get_token)
149
+
150
+
151
class OnedriveIndexerConfig(IndexerConfig):
    """Options selecting which OneDrive folder is indexed and how deeply."""

    # Folder path to start from; a falsy value means the drive root is used.
    path: Optional[str] = Field(default="")
    # When True, files inside nested sub-folders are listed as well.
    recursive: bool = Field(default=False)
154
+
155
+
156
@dataclass
class OnedriveIndexer(Indexer):
    """List files in a user's OneDrive folder and emit one ``FileData`` per file."""

    connection_config: OnedriveConnectionConfig
    index_config: OnedriveIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Validate the connection by requesting a token.

        Raises:
            SourceConnectionError: the token request failed or the response
                carried an ``error`` entry. The original exception is chained.
        """
        try:
            token_resp: dict = self.connection_config.get_token()
            if error := token_resp.get("error"):
                raise SourceConnectionError(
                    "{} ({})".format(error, token_resp.get("error_description"))
                )
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def list_objects_sync(self, folder: "DriveItem", recursive: bool) -> list["DriveItem"]:
        """Return the files directly under ``folder``, descending into sub-folders if asked."""
        drive_items = folder.children.get().execute_query()
        files = [item for item in drive_items if item.is_file]
        if not recursive:
            return files

        for sub_folder in (item for item in drive_items if item.is_folder):
            files.extend(self.list_objects_sync(sub_folder, recursive))
        return files

    async def list_objects(self, folder: "DriveItem", recursive: bool) -> list["DriveItem"]:
        """Run ``list_objects_sync`` in a worker thread."""
        return await asyncio.to_thread(self.list_objects_sync, folder, recursive)

    def get_root_sync(self, client: "GraphClient") -> "DriveItem":
        """Resolve the folder to index: the drive root, or ``index_config.path`` beneath it.

        Raises:
            ValueError: the configured path does not resolve to a folder.
        """
        root = client.users[self.connection_config.user_pname].drive.get().execute_query().root
        if fpath := self.index_config.path:
            root = root.get_by_path(fpath).get().execute_query()
            if root is None or not root.is_folder:
                raise ValueError(f"Unable to find directory, given: {fpath}")
        return root

    async def get_root(self, client: "GraphClient") -> "DriveItem":
        """Run ``get_root_sync`` in a worker thread."""
        return await asyncio.to_thread(self.get_root_sync, client)

    def get_properties_sync(self, drive_item: "DriveItem") -> dict:
        """Return the item's properties, keeping only JSON-serializable values."""
        filtered_properties = {}
        for key, value in drive_item.properties.items():
            try:
                json.dumps(value)
            except TypeError:
                # Deliberate best-effort: non-serializable values are dropped.
                continue
            filtered_properties[key] = value
        return filtered_properties

    async def get_properties(self, drive_item: "DriveItem") -> dict:
        """Run ``get_properties_sync`` in a worker thread."""
        return await asyncio.to_thread(self.get_properties_sync, drive_item)

    def _relative_to_index_path(self, server_path: str) -> str:
        """Strip the configured index path from the front of ``server_path``.

        Only a leading, whole-segment match is removed, so a folder name that
        reappears deeper in the path is left intact. A missing (None) or empty
        index path leaves the path unchanged apart from leading slashes.
        """
        base = (self.index_config.path or "").strip("/")
        candidate = server_path.lstrip("/")
        if base and (candidate == base or candidate.startswith(base + "/")):
            candidate = candidate[len(base):]
        return candidate.lstrip("/")

    def drive_item_to_file_data_sync(self, drive_item: "DriveItem") -> FileData:
        """Build the ``FileData`` record describing ``drive_item``."""
        parent_path = drive_item.parent_reference.path
        # Keep what follows the last ':' of the parent reference, minus one leading '/'.
        file_path = parent_path.split(":")[-1].removeprefix("/")
        filename = drive_item.name
        server_path = file_path + "/" + filename
        rel_path = self._relative_to_index_path(server_path)
        date_modified_dt = (
            parser.parse(str(drive_item.last_modified_datetime))
            if drive_item.last_modified_datetime
            else None
        )
        date_created_at = (
            parser.parse(str(drive_item.created_datetime)) if drive_item.created_datetime else None
        )
        return FileData(
            identifier=drive_item.id,
            connector_type=self.connector_type,
            source_identifiers=SourceIdentifiers(
                fullpath=server_path, filename=filename, rel_path=rel_path
            ),
            metadata=FileDataSourceMetadata(
                url=parent_path + "/" + filename,
                version=drive_item.etag,
                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
                date_created=str(date_created_at.timestamp()) if date_created_at else None,
                date_processed=str(time()),
                record_locator={
                    "user_pname": self.connection_config.user_pname,
                    "server_relative_path": server_path,
                },
            ),
            additional_metadata=self.get_properties_sync(drive_item=drive_item),
            display_name=server_path,
        )

    async def drive_item_to_file_data(self, drive_item: "DriveItem") -> FileData:
        """Run ``drive_item_to_file_data_sync`` in a worker thread."""
        return await asyncio.to_thread(self.drive_item_to_file_data_sync, drive_item)

    def is_async(self) -> bool:
        """Report that this indexer is driven through ``run_async``."""
        return True

    async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
        """Yield a ``FileData`` for every file found under the configured folder.

        Raises:
            SourceConnectionError: the token response carried an ``error`` entry.
        """
        token_resp = await asyncio.to_thread(self.connection_config.get_token)
        if "error" in token_resp:
            raise SourceConnectionError(
                f"[{self.connector_type}]: {token_resp['error']} "
                f"({token_resp.get('error_description')})"
            )

        client = await asyncio.to_thread(self.connection_config.get_client)
        root = await self.get_root(client=client)
        drive_items = await self.list_objects(folder=root, recursive=self.index_config.recursive)

        for drive_item in drive_items:
            yield await self.drive_item_to_file_data(drive_item=drive_item)
269
+
270
+
271
class OnedriveDownloaderConfig(DownloaderConfig):
    """Download options for OneDrive; declares nothing beyond its base class."""
273
+
274
+
275
@dataclass
class OnedriveDownloader(Downloader):
    """Fetch a single OneDrive file described by a ``FileData`` onto local disk."""

    connection_config: OnedriveConnectionConfig
    download_config: OnedriveDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    @SourceConnectionNetworkError.wrap
    def _fetch_file(self, file_data: FileData) -> DriveItem:
        """Look up the drive item at the full path recorded in ``file_data``.

        Raises:
            ValueError: ``file_data`` carries no source identifiers or full path.
            FileNotFoundError: the lookup produced a falsy result.
        """
        identifiers = file_data.source_identifiers
        if identifiers is None or not identifiers.fullpath:
            raise ValueError(
                f"file data doesn't have enough information to get "
                f"file content: {file_data.model_dump()}"
            )

        server_relative_path = file_data.source_identifiers.fullpath
        client = self.connection_config.get_client()
        user_drive = client.users[self.connection_config.user_pname].drive
        root = user_drive.get().execute_query().root
        file = root.get_by_path(server_relative_path).get().execute_query()
        if file:
            return file
        raise FileNotFoundError(f"file not found: {server_relative_path}")

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Return the local target path: the download dir joined with the relative path."""
        # Drop a single leading slash so the join stays inside the download dir.
        rel_path = file_data.source_identifiers.relative_path.removeprefix("/")
        return self.download_dir / Path(rel_path)

    @SourceConnectionError.wrap
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the file and return the generated download response.

        Files larger than ``MAX_BYTES_SIZE`` are downloaded with an explicit
        100 MiB chunk size. Any exception is logged and re-raised.
        """
        try:
            file = self._fetch_file(file_data=file_data)
            fsize = file.get_property("size", 0)
            download_path = self.get_download_path(file_data=file_data)
            download_path.parent.mkdir(parents=True, exist_ok=True)
            logger.info(f"downloading {file_data.source_identifiers.fullpath} to {download_path}")

            session_kwargs = {}
            if fsize > MAX_BYTES_SIZE:
                logger.info(f"downloading file with size: {fsize} bytes in chunks")
                session_kwargs["chunk_size"] = 1024 * 1024 * 100

            with download_path.open(mode="wb") as f:
                file.download_session(f, **session_kwargs).execute_query()
            return self.generate_download_response(file_data=file_data, download_path=download_path)
        except Exception as e:
            logger.error(
                f"[{self.connector_type}] Exception during downloading: {e}", exc_info=True
            )
            # Re-raise to see full stack trace locally
            raise
324
+
325
+
326
class OnedriveUploaderConfig(UploaderConfig):
    """Destination settings for uploads, expressed as a OneDrive URL."""

    remote_url: str = Field(
        description="URL of the destination in OneDrive, e.g., 'onedrive://Documents/Folder'"
    )
    # Scheme marker stripped from the front of ``remote_url`` when present.
    prefix: str = "onedrive://"

    @property
    def root_folder(self) -> str:
        """First path segment of the destination once the prefix is removed."""
        first_segment, _, _ = self.url.partition("/")
        return first_segment

    @property
    def url(self) -> str:
        """``remote_url`` without its leading ``prefix`` (unchanged when absent)."""
        return self.remote_url.removeprefix(self.prefix)
349
+
350
+
351
+ @dataclass
352
+ class OnedriveUploader(Uploader):
353
+ connection_config: OnedriveConnectionConfig
354
+ upload_config: OnedriveUploaderConfig
355
+ connector_type: str = CONNECTOR_TYPE
356
+
357
@requires_dependencies(["office365"], extras="onedrive")
def precheck(self) -> None:
    """Validate credentials and make sure the destination root folder exists.

    Requests a token, then looks up the upload config's root folder on the
    user's drive and creates it when the lookup answers 404.

    Raises:
        SourceConnectionError: any step failed; the original exception is
            chained as the cause.
    """
    # NOTE(review): this is a destination-side check but raises
    # SourceConnectionError; DestinationConnectionError is imported by this
    # file -- confirm which type callers expect before changing it.
    from office365.runtime.client_request_exception import ClientRequestException

    try:
        token_resp: dict = self.connection_config.get_token()
        if error := token_resp.get("error"):
            raise SourceConnectionError(
                "{} ({})".format(error, token_resp.get("error_description"))
            )
        root = self.connection_config.get_drive().root
        root_folder = self.upload_config.root_folder
        folder = root.get_by_path(root_folder)
        try:
            folder.get().execute_query()
        except ClientRequestException as e:
            # Only "not found" means the folder must be created; anything else propagates.
            if e.response.status_code != 404:
                raise
            folder = root.create_folder(root_folder).execute_query()
            logger.info(f"successfully created folder: {folder.name}")
    except Exception as e:
        logger.error(f"failed to validate connection: {e}", exc_info=True)
        raise SourceConnectionError(f"failed to validate connection: {e}") from e
381
+
382
+ @requires_dependencies(["office365"], extras="onedrive")
383
+ def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
384
+ from office365.onedrive.driveitems.conflict_behavior import ConflictBehavior
385
+ from office365.runtime.client_request_exception import ClientRequestException
386
+
387
+ drive = self.connection_config.get_drive()
388
+
389
+ # Use the remote_url from upload_config as the base destination folder
390
+ base_destination_folder = self.upload_config.url
391
+ # Use the file's relative path to maintain directory structure, if needed
392
+ if file_data.source_identifiers and file_data.source_identifiers.relative_path:
393
+ # Combine the base destination folder with the file's relative path
394
+ destination_path = Path(base_destination_folder) / Path(
395
+ f"{file_data.source_identifiers.relative_path}.json"
396
+ )
397
+ else:
398
+ # If no relative path is provided, upload directly to the base destination folder
399
+ destination_path = Path(base_destination_folder) / f"{path.name}.json"
400
+
401
+ destination_folder = destination_path.parent
402
+ file_name = destination_path.name
403
+
404
+ # Convert destination folder to a string suitable for OneDrive API
405
+ destination_folder_str = str(destination_folder).replace("\\", "/")
406
+
407
+ # Resolve the destination folder in OneDrive, creating it if necessary
408
+ try:
409
+ # Attempt to get the folder
410
+ folder = drive.root.get_by_path(destination_folder_str)
411
+ folder.get().execute_query()
412
+ except ClientRequestException as e:
413
+ # Folder doesn't exist, create it recursively
414
+ root = drive.root
415
+ root_folder = self.upload_config.root_folder
416
+ if not e.response.status_code == 404:
417
+ raise e
418
+ folder = root.create_folder(root_folder).execute_query()
419
+ logger.info(f"successfully created folder: {folder.name}")
420
+
421
+ # Check the size of the file
422
+ file_size = path.stat().st_size
423
+
424
+ if file_size < MAX_BYTES_SIZE:
425
+ # Use simple upload for small files
426
+ with path.open("rb") as local_file:
427
+ content = local_file.read()
428
+ logger.info(f"Uploading {path} to {destination_path} using simple upload")
429
+ try:
430
+ uploaded_file = folder.upload(file_name, content).execute_query()
431
+ if not uploaded_file or uploaded_file.name != file_name:
432
+ raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
433
+ # Log details about the uploaded file
434
+ logger.info(
435
+ f"Uploaded file '{uploaded_file.name}' with ID '{uploaded_file.id}'"
436
+ )
437
+ except Exception as e:
438
+ logger.error(f"Failed to upload file '{file_name}': {e}", exc_info=True)
439
+ raise DestinationConnectionError(
440
+ f"Failed to upload file '{file_name}': {e}"
441
+ ) from e
442
+ else:
443
+ # Use resumable upload for large files
444
+ destination_drive_item = drive.root.get_by_path(destination_folder_str)
445
+
446
+ logger.info(
447
+ f"Uploading {path.parent / file_name} to {destination_folder_str} using resumable upload" # noqa: E501
448
+ )
449
+
450
+ try:
451
+ uploaded_file = destination_drive_item.resumable_upload(
452
+ source_path=str(path)
453
+ ).execute_query()
454
+ # Rename the uploaded file to the original source name with a .json extension
455
+ # Overwrite the file if it already exists
456
+ renamed_file = uploaded_file.move(
457
+ name=file_name, conflict_behavior=ConflictBehavior.Replace
458
+ ).execute_query()
459
+ # Validate the upload
460
+ if not renamed_file or renamed_file.name != file_name:
461
+ raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
462
+ # Log details about the uploaded file
463
+ logger.info(f"Uploaded file {renamed_file.name} with ID {renamed_file.id}")
464
+ except Exception as e:
465
+ logger.error(f"Failed to upload file '{file_name}' using resumable upload: {e}")
466
+ raise DestinationConnectionError(
467
+ f"Failed to upload file '{file_name}' using resumable upload: {e}"
468
+ ) from e
469
+
470
+
471
# Registry entry bundling the OneDrive connection config with its indexer and
# downloader classes and their respective configs.
onedrive_source_entry = SourceRegistryEntry(
    indexer=OnedriveIndexer,
    indexer_config=OnedriveIndexerConfig,
    downloader=OnedriveDownloader,
    downloader_config=OnedriveDownloaderConfig,
    connection_config=OnedriveConnectionConfig,
)
478
+
479
# Registry entry bundling the OneDrive connection config with the uploader and
# the blob-store upload stager, plus their respective configs.
onedrive_destination_entry = DestinationRegistryEntry(
    upload_stager=BlobStoreUploadStager,
    upload_stager_config=BlobStoreUploadStagerConfig,
    uploader=OnedriveUploader,
    uploader_config=OnedriveUploaderConfig,
    connection_config=OnedriveConnectionConfig,
)