unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,203 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from time import time
6
+ from typing import TYPE_CHECKING, Any, Generator, Optional
7
+
8
+ from pydantic import Field, Secret
9
+
10
+ from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
11
+ from unstructured_ingest.error import ProviderError, UserAuthError, UserError, ValueError
12
+ from unstructured_ingest.logger import logger
13
+ from unstructured_ingest.processes.connector_registry import (
14
+ DestinationRegistryEntry,
15
+ SourceRegistryEntry,
16
+ )
17
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
18
+ FsspecAccessConfig,
19
+ FsspecConnectionConfig,
20
+ FsspecDownloader,
21
+ FsspecDownloaderConfig,
22
+ FsspecIndexer,
23
+ FsspecIndexerConfig,
24
+ FsspecUploader,
25
+ FsspecUploaderConfig,
26
+ )
27
+ from unstructured_ingest.processes.connectors.fsspec.utils import json_serial, sterilize_dict
28
+ from unstructured_ingest.processes.utils.blob_storage import (
29
+ BlobStoreUploadStager,
30
+ BlobStoreUploadStagerConfig,
31
+ )
32
+ from unstructured_ingest.utils.dep_check import requires_dependencies
33
+
34
+ if TYPE_CHECKING:
35
+ from adlfs import AzureBlobFileSystem
36
+
37
+ CONNECTOR_TYPE = "azure"
38
+
39
+
40
def azure_json_serial(obj):
    """JSON ``default`` hook used when sterilizing Azure file listings.

    ``bytearray`` values are converted with ``str()``, ``ContentSettings``
    instances are converted with ``dict()``, and anything else is handed
    to ``json_serial``.
    """
    # Deferred import: the azure package is only touched when the hook is called.
    from azure.storage.blob._models import ContentSettings

    if isinstance(obj, bytearray):
        return str(obj)
    if isinstance(obj, ContentSettings):
        return dict(obj)
    return json_serial(obj)
48
+
49
+
50
class AzureIndexerConfig(FsspecIndexerConfig):
    # Azure-specific indexer config; declares nothing beyond FsspecIndexerConfig.
    ...
52
+
53
+
54
class AzureAccessConfig(FsspecAccessConfig):
    # Credentials for Azure Blob Storage. Every field is optional on its own;
    # model_post_init enforces that connection_string or account_name is present.
    account_name: Optional[str] = Field(
        description=(
            "The storage account name. This is used to authenticate "
            "requests signed with an account key and to construct "
            "the storage endpoint. It is required unless a connection "
            "string is given, or if a custom domain is used with "
            "anonymous authentication."
        ),
        default=None,
    )
    account_key: Optional[str] = Field(
        description=(
            "The storage account key. This is used for shared key "
            "authentication. If any of account key, sas token or "
            "client_id are not specified, anonymous access will be used."
        ),
        default=None,
    )
    connection_string: Optional[str] = Field(
        description=(
            "If specified, this will override all other parameters. See "
            "http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ "  # noqa: E501
            "for the connection string format."
        ),
        default=None,
    )
    sas_token: Optional[str] = Field(
        description=(
            "A shared access signature token to use to authenticate "
            "requests instead of the account key. If account key and "
            "sas token are both specified, account key will be used "
            "to sign. If any of account key, sas token or client_id "
            "are not specified, anonymous access will be used."
        ),
        default=None,
    )

    def model_post_init(self, __context: Any) -> None:
        """Reject a config that sets neither ``connection_string`` nor ``account_name``."""
        locators = (self.connection_string, self.account_name)
        if all(value is None for value in locators):
            raise ValueError("either connection_string or account_name must be set")
87
+
88
+
89
class AzureConnectionConfig(FsspecConnectionConfig):
    # Connection settings for the Azure connector; the only supported protocol is "az".
    supported_protocols: list[str] = field(default_factory=lambda: ["az"], init=False)
    access_config: Secret[AzureAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Return the dumped access config with every falsy value removed."""
        # The filter is truthiness-based, so empty strings are dropped along with None.
        dumped = self.access_config.get_secret_value().model_dump()
        return {key: value for key, value in dumped.items() if value}

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["AzureBlobFileSystem", None, None]:
        """Yield the client created by the parent class for ``protocol``."""
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Translate an Azure SDK exception into an ingest error type.

        Args:
            e: The exception raised while talking to Azure.

        Returns:
            ``UserAuthError`` for ``ClientAuthenticationError``, ``UserError`` for
            4xx status codes, ``ProviderError`` for status codes of 500 and above.
            Any other exception is logged and returned unchanged.
        """
        from azure.core.exceptions import ClientAuthenticationError, HttpResponseError

        if not isinstance(e, HttpResponseError):
            logger.error(f"unhandled exception from azure ({type(e)}): {e}", exc_info=True)
            return e
        # azure-core leaves ``reason`` as None when no HTTP response is attached to the
        # error; fall back to the exception text so the wrapped error keeps a message.
        message = e.reason or str(e)
        if isinstance(e, ClientAuthenticationError):
            return UserAuthError(message)
        status_code = e.status_code
        if status_code is not None:
            if 400 <= status_code < 500:
                return UserError(message)
            if status_code >= 500:
                return ProviderError(message)
        logger.error(f"unhandled exception from azure ({type(e)}): {e}", exc_info=True)
        return e
124
+
125
+
126
@dataclass
class AzureIndexer(FsspecIndexer):
    """Indexer for the Azure connector, built on the generic fsspec indexer."""

    connection_config: AzureConnectionConfig
    index_config: AzureIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def sterilize_info(self, file_data: dict) -> dict:
        """Run ``file_data`` through ``sterilize_dict`` using the Azure JSON hook."""
        return sterilize_dict(data=file_data, default=azure_json_serial)

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build source metadata from one file-listing entry.

        Args:
            file_info: Listing entry; ``name`` is required, while ``creation_time``,
                ``last_modified``, ``size`` and ``etag`` are read when present.

        Returns:
            A ``FileDataSourceMetadata`` whose dates are stringified timestamps
            (or ``None`` when the listing carries no value for them).
        """
        path = file_info["name"]

        # Test the value rather than key membership: a key that is present but
        # holds None would otherwise fail on ``.timestamp()``.
        created = file_info.get("creation_time")
        date_created = str(created.timestamp()) if created is not None else None
        modified = file_info.get("last_modified")
        date_modified = str(modified.timestamp()) if modified is not None else None

        protocol = self.index_config.protocol
        record_locator = {
            "protocol": protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=file_info.get("etag"),
            url=f"{protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_info.get("size"),
        )
164
+
165
+
166
class AzureDownloaderConfig(FsspecDownloaderConfig):
    # Azure-specific downloader config; declares nothing beyond FsspecDownloaderConfig.
    ...
168
+
169
+
170
@dataclass
class AzureDownloader(FsspecDownloader):
    # Downloader for the Azure connector; the protocol defaults to "az".
    protocol: str = "az"
    connection_config: AzureConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh AzureDownloaderConfig is created per instance when none is supplied.
    download_config: Optional[AzureDownloaderConfig] = field(
        default_factory=AzureDownloaderConfig,
    )
176
+
177
+
178
class AzureUploaderConfig(FsspecUploaderConfig):
    # Azure-specific uploader config; declares nothing beyond FsspecUploaderConfig.
    ...
180
+
181
+
182
@dataclass
class AzureUploader(FsspecUploader):
    # Uploader for the Azure connector.
    connector_type: str = CONNECTOR_TYPE
    connection_config: AzureConnectionConfig
    # Defaults to None when the caller does not pass an upload config.
    upload_config: AzureUploaderConfig = None
187
+
188
+
189
# Registry entry wiring together the Azure source-side classes.
azure_source_entry = SourceRegistryEntry(
    connection_config=AzureConnectionConfig,
    indexer_config=AzureIndexerConfig,
    indexer=AzureIndexer,
    downloader_config=AzureDownloaderConfig,
    downloader=AzureDownloader,
)

# Registry entry wiring together the Azure destination-side classes.
azure_destination_entry = DestinationRegistryEntry(
    connection_config=AzureConnectionConfig,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
    uploader_config=AzureUploaderConfig,
    uploader=AzureUploader,
)
@@ -0,0 +1,176 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from time import time
6
+ from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
7
+
8
+ from dateutil import parser
9
+ from pydantic import Field, Secret
10
+ from pydantic.functional_validators import BeforeValidator
11
+
12
+ from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
13
+ from unstructured_ingest.error import ProviderError, UserAuthError, UserError
14
+ from unstructured_ingest.logger import logger
15
+ from unstructured_ingest.processes.connector_registry import (
16
+ DestinationRegistryEntry,
17
+ SourceRegistryEntry,
18
+ )
19
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
20
+ FsspecAccessConfig,
21
+ FsspecConnectionConfig,
22
+ FsspecDownloader,
23
+ FsspecDownloaderConfig,
24
+ FsspecIndexer,
25
+ FsspecIndexerConfig,
26
+ FsspecUploader,
27
+ FsspecUploaderConfig,
28
+ )
29
+ from unstructured_ingest.processes.connectors.utils import conform_string_to_dict
30
+ from unstructured_ingest.processes.utils.blob_storage import (
31
+ BlobStoreUploadStager,
32
+ BlobStoreUploadStagerConfig,
33
+ )
34
+ from unstructured_ingest.utils.dep_check import requires_dependencies
35
+
36
+ if TYPE_CHECKING:
37
+ from boxfs import BoxFileSystem
38
+
39
+ CONNECTOR_TYPE = "box"
40
+
41
+
42
class BoxIndexerConfig(FsspecIndexerConfig):
    # Box-specific indexer config; declares nothing beyond FsspecIndexerConfig.
    ...
44
+
45
+
46
class BoxAccessConfig(FsspecAccessConfig):
    # The raw value is passed through conform_string_to_dict before pydantic
    # validates it as a dict.
    box_app_config: Annotated[
        dict,
        BeforeValidator(conform_string_to_dict),
    ] = Field(description="Box app credentials as a JSON string.")
50
+
51
+
52
class BoxConnectionConfig(FsspecConnectionConfig):
    # Connection settings for the Box connector; the only supported protocol is "box".
    supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
    access_config: Secret[BoxAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Authenticate with Box via JWT and return the resulting access kwargs.

        Returns:
            A dict holding the authenticated ``oauth`` object plus every dumped
            access-config field except ``box_app_config``.
        """
        from boxsdk import JWTAuth

        ac = self.access_config.get_secret_value()

        # Create and authenticate the JWTAuth object
        oauth = JWTAuth.from_settings_dictionary(ac.box_app_config)
        oauth.authenticate_instance()

        # The app config is consumed above; only the remaining fields are forwarded.
        remaining = {
            key: value for key, value in ac.model_dump().items() if key != "box_app_config"
        }
        return {"oauth": oauth, **remaining}

    def wrap_error(self, e: Exception) -> Exception:
        """Translate a Box SDK exception into an ingest error type.

        Args:
            e: The exception raised while talking to Box.

        Returns:
            ``UserAuthError`` for ``BoxOAuthException``, ``UserError`` for 4xx
            statuses, ``ProviderError`` for statuses of 500 and above. Any other
            exception is logged and returned unchanged.
        """
        from boxsdk.exception import BoxAPIException, BoxOAuthException

        if isinstance(e, BoxOAuthException):
            # Fall back to the exception text so the wrapped error always has a message.
            return UserAuthError(getattr(e, "message", None) or str(e))
        if not isinstance(e, BoxAPIException):
            logger.error(f"unhandled exception from box ({type(e)}): {e}", exc_info=True)
            return e
        # Always pass a string on: the wrapped error should not carry the exception object.
        message = e.message or str(e)
        if error_code_status := e.status:
            if 400 <= error_code_status < 500:
                return UserError(message)
            if error_code_status >= 500:
                return ProviderError(message)

        logger.error(f"unhandled exception from box ({type(e)}): {e}", exc_info=True)
        return e

    @requires_dependencies(["boxfs"], extras="box")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["BoxFileSystem", None, None]:
        """Yield the client created by the parent class for ``protocol``."""
        with super().get_client(protocol=protocol) as client:
            yield client
103
+
104
+
105
@dataclass
class BoxIndexer(FsspecIndexer):
    # Indexer for the Box connector, built on the generic fsspec indexer.
    connection_config: BoxConnectionConfig
    index_config: BoxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build source metadata from one Box file-listing entry."""

        def _to_epoch(raw):
            # Empty or missing date strings yield None instead of being parsed.
            return str(parser.parse(raw).timestamp()) if raw else None

        # Parse order (modified, then created) is kept from the original implementation.
        modified_epoch = _to_epoch(file_info.get("modified_at"))
        created_epoch = _to_epoch(file_info.get("created_at"))

        file_id = file_info.get("id")
        protocol = self.index_config.protocol
        location = f"{protocol}://{file_info['name']}"
        return FileDataSourceMetadata(
            date_created=created_epoch,
            date_modified=modified_epoch,
            date_processed=str(time()),
            version=file_id,
            url=location,
            record_locator={
                "protocol": protocol,
                "remote_file_path": self.index_config.remote_url,
                "file_id": file_id,
            },
            filesize_bytes=file_info.get("size"),
        )
137
+
138
+
139
class BoxDownloaderConfig(FsspecDownloaderConfig):
    # Box-specific downloader config; declares nothing beyond FsspecDownloaderConfig.
    ...
141
+
142
+
143
@dataclass
class BoxDownloader(FsspecDownloader):
    # Downloader for the Box connector; the protocol defaults to "box".
    protocol: str = "box"
    connection_config: BoxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh BoxDownloaderConfig is created per instance when none is supplied.
    download_config: Optional[BoxDownloaderConfig] = field(
        default_factory=BoxDownloaderConfig,
    )
149
+
150
+
151
class BoxUploaderConfig(FsspecUploaderConfig):
    # Box-specific uploader config; declares nothing beyond FsspecUploaderConfig.
    ...
153
+
154
+
155
@dataclass
class BoxUploader(FsspecUploader):
    # Uploader for the Box connector.
    connector_type: str = CONNECTOR_TYPE
    connection_config: BoxConnectionConfig
    # Defaults to None when the caller does not pass an upload config.
    upload_config: BoxUploaderConfig = None
160
+
161
+
162
# Registry entry wiring together the Box source-side classes.
box_source_entry = SourceRegistryEntry(
    connection_config=BoxConnectionConfig,
    indexer_config=BoxIndexerConfig,
    indexer=BoxIndexer,
    downloader_config=BoxDownloaderConfig,
    downloader=BoxDownloader,
)

# Registry entry wiring together the Box destination-side classes.
box_destination_entry = DestinationRegistryEntry(
    connection_config=BoxConnectionConfig,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
    uploader_config=BoxUploaderConfig,
    uploader=BoxUploader,
)
@@ -0,0 +1,238 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from time import time
5
+ from typing import TYPE_CHECKING, Any, Optional
6
+
7
+ from pydantic import Field, Secret
8
+
9
+ from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
10
+ from unstructured_ingest.error import (
11
+ ProviderError,
12
+ UserAuthError,
13
+ UserError,
14
+ ValueError,
15
+ )
16
+ from unstructured_ingest.error import (
17
+ RateLimitError as CustomRateLimitError,
18
+ )
19
+ from unstructured_ingest.logger import logger
20
+ from unstructured_ingest.processes.connector_registry import (
21
+ DestinationRegistryEntry,
22
+ SourceRegistryEntry,
23
+ )
24
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
25
+ FsspecAccessConfig,
26
+ FsspecConnectionConfig,
27
+ FsspecDownloader,
28
+ FsspecDownloaderConfig,
29
+ FsspecIndexer,
30
+ FsspecIndexerConfig,
31
+ FsspecUploader,
32
+ FsspecUploaderConfig,
33
+ )
34
+ from unstructured_ingest.processes.utils.blob_storage import (
35
+ BlobStoreUploadStager,
36
+ BlobStoreUploadStagerConfig,
37
+ )
38
+ from unstructured_ingest.utils.dep_check import requires_dependencies
39
+
40
+ if TYPE_CHECKING:
41
+ pass
42
+
43
# Identifier shared by every Dropbox connector component defined in this module.
CONNECTOR_TYPE = "dropbox"
44
+
45
+
46
# Indexer settings for the Dropbox connector. After model initialisation the
# protocol-less path is normalised so that it always begins with "/".
class DropboxIndexerConfig(FsspecIndexerConfig):
    def model_post_init(self, __context):
        current_path = self.path_without_protocol
        if current_path.startswith("/"):
            # Already rooted; nothing to normalise.
            return
        self.path_without_protocol = "/" + current_path
50
+
51
+
52
# Credentials accepted by the Dropbox connector. Every field is optional at
# the model level.
class DropboxAccessConfig(FsspecAccessConfig):
    # This is the short lived (4h) token that needs to be generated anew each time.
    token: Optional[str] = Field(
        default=None,
        description="Dropbox access token.",
    )
    app_key: Optional[str] = Field(
        default=None,
        description="Dropbox app key.",
    )
    app_secret: Optional[str] = Field(
        default=None,
        description="Dropbox app secret.",
    )
    # This is the long lived token that doesn't expire.
    refresh_token: Optional[str] = Field(
        default=None,
        description="Dropbox refresh token.",
    )
61
+
62
+
63
# Connection settings for the Dropbox connector. Besides holding the (secret)
# access config, this class knows how to exchange refresh credentials for a
# short-lived access token and how to translate Dropbox SDK errors into the
# project's error types.
class DropboxConnectionConfig(FsspecConnectionConfig):
    access_config: Secret[DropboxAccessConfig] = Field(
        default=DropboxAccessConfig(), validate_default=True
    )
    connector_type: str = Field(default=CONNECTOR_TYPE)

    @requires_dependencies(["dropbox"])
    def get_dropbox_access_token_from_refresh(
        self,
        refresh_token: str,
        app_key: str,
        app_secret: str,
    ) -> str:
        """
        Uses the Dropbox Python SDK to exchange a long-lived refresh token for an access token.

        Args:
            refresh_token: Long-lived Dropbox refresh token.
            app_key: Dropbox app key paired with the refresh token.
            app_secret: Dropbox app secret paired with the refresh token.

        Returns:
            The access token held by the SDK client after the refresh call.
        """
        import dropbox

        dbx = dropbox.Dropbox(
            oauth2_access_token=None,
            oauth2_refresh_token=refresh_token,
            app_key=app_key,
            app_secret=app_secret,
        )

        # This call fetches a new short-lived token and auto-updates dbx._oauth2_access_token
        dbx.check_and_refresh_access_token()
        short_lived_token = dbx._oauth2_access_token  # Private attr, but standard usage
        return short_lived_token

    def get_access_config(self) -> dict[str, Any]:
        """
        Overrides the parent FsspecConnectionConfig.get_access_config() to ensure
        that we always provide an access token if refresh credentials exist.

        Returns:
            The parent's access-config dict, with "token" replaced by a freshly
            generated access token when refresh credentials are complete.

        Raises:
            ValueError: (the name imported from unstructured_ingest.error) when
                the refresh yields no token, or when neither a token nor a
                complete set of refresh credentials was provided.
        """
        base_conf = super().get_access_config()

        refresh_token = base_conf.get("refresh_token")
        app_key = base_conf.get("app_key")
        app_secret = base_conf.get("app_secret")

        # Standard scenario - we have a refresh token and app credentials,
        # which we're going to use to retrieve an access token
        if refresh_token and app_key and app_secret:
            logger.debug("Attempting to generate access token from refresh token...")
            new_token = self.get_dropbox_access_token_from_refresh(
                refresh_token=refresh_token,
                app_key=app_key,
                app_secret=app_secret,
            )
            if not new_token:
                raise ValueError(
                    "Unable to retrieve an access token from Dropbox. "
                    "Please check that your refresh token, app key, and secret are valid."
                )
            base_conf["token"] = new_token
        elif not base_conf.get("token"):  # we might already have an access token from outside
            # We have neither an existing short-lived token nor refresh credentials
            raise ValueError(
                "No valid token or refresh_token with app credentials was found. "
                "Please check that your refresh token, app key, and secret are valid "
                "or provide a valid short-lived token"
            )

        return base_conf

    @requires_dependencies(["dropbox"])
    def wrap_error(self, e: Exception) -> Exception:
        """
        Translate a Dropbox SDK exception into the matching project error.

        The wrapped exception is always RETURNED (never raised here) so that
        every branch honours the declared return type and the caller decides
        when to raise.

        Args:
            e: The exception caught while talking to Dropbox.

        Returns:
            UserAuthError for Dropbox AuthError, the project's RateLimitError
            for Dropbox RateLimitError, UserError for other 4xx responses,
            ProviderError for 5xx responses, and the original exception
            (after logging) for anything else.
        """
        from dropbox.exceptions import AuthError, HttpError, RateLimitError

        if not isinstance(e, HttpError):
            logger.error(f"Unhandled Dropbox exception: {repr(e)}", exc_info=True)
            return e

        if isinstance(e, AuthError):
            return UserAuthError(e.error)
        if isinstance(e, RateLimitError):
            return CustomRateLimitError(e.error)

        status_code = e.status_code
        # Prefer the response body as the message; fall back to the exception
        # text when the body is missing or empty so the error is never blank.
        message = getattr(e, "body", None) or str(e)
        if 400 <= status_code < 500:
            return UserError(message)
        if status_code >= 500:
            return ProviderError(message)

        logger.error(f"Unhandled Dropbox HttpError: {repr(e)}", exc_info=True)
        return e
156
+
157
+
158
@dataclass
class DropboxIndexer(FsspecIndexer):
    """Indexer for the Dropbox connector, built on the generic fsspec indexer."""

    connection_config: DropboxConnectionConfig
    index_config: DropboxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_path(self, file_info: dict) -> str:
        """Return the file's path, taken from the "name" entry of ``file_info``."""
        return file_info["name"]

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """
        Build source metadata for one listed file.

        Args:
            file_info: Listing entry for the file. "name" is required;
                "server_modified", "client_modified", "size", "content_hash"
                and "id" are read when present.

        Returns:
            FileDataSourceMetadata whose created/modified dates are the earlier
            and later of the two modification timestamps (both must be present,
            otherwise the dates stay None).
        """
        path = file_info["name"].lstrip("/")
        date_created = None
        date_modified = None
        server_modified = file_info.get("server_modified")
        client_modified = file_info.get("client_modified")
        if server_modified and client_modified:
            # The earlier timestamp is treated as creation and the later one as
            # modification. min/max also covers the case where both are equal,
            # which previously left both dates unset.
            date_created = str(min(server_modified, client_modified).timestamp())
            date_modified = str(max(server_modified, client_modified).timestamp())

        file_size = file_info.get("size")

        version = file_info.get("content_hash")
        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
            "file_id": file_info.get("id"),
        }
        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_size,
        )
197
+
198
+
199
# Download settings for the Dropbox connector. No Dropbox-specific options are
# added; everything is inherited from FsspecDownloaderConfig.
class DropboxDownloaderConfig(FsspecDownloaderConfig):
    pass
201
+
202
+
203
@dataclass
class DropboxDownloader(FsspecDownloader):
    """Downloader for the Dropbox connector, built on the generic fsspec downloader."""

    # fsspec protocol name used for this connector.
    protocol: str = "dropbox"
    connection_config: DropboxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh default config object is created per instance.
    download_config: Optional[DropboxDownloaderConfig] = field(
        default_factory=DropboxDownloaderConfig,
    )
211
+
212
+
213
# Upload settings for the Dropbox connector. No Dropbox-specific options are
# added; everything is inherited from FsspecUploaderConfig.
class DropboxUploaderConfig(FsspecUploaderConfig):
    pass
215
+
216
+
217
@dataclass
class DropboxUploader(FsspecUploader):
    """Uploader for the Dropbox connector, built on the generic fsspec uploader."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: DropboxConnectionConfig
    # Defaults to None when the caller does not supply an upload config.
    upload_config: DropboxUploaderConfig = None
222
+
223
+
224
# Registry entry describing the Dropbox connector when used as a source
# (indexing + downloading).
dropbox_source_entry = SourceRegistryEntry(
    connection_config=DropboxConnectionConfig,
    indexer_config=DropboxIndexerConfig,
    indexer=DropboxIndexer,
    downloader_config=DropboxDownloaderConfig,
    downloader=DropboxDownloader,
)

# Registry entry describing the Dropbox connector when used as a destination
# (staging + uploading).
dropbox_destination_entry = DestinationRegistryEntry(
    connection_config=DropboxConnectionConfig,
    uploader_config=DropboxUploaderConfig,
    uploader=DropboxUploader,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)