unstructured_ingest-1.2.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
unstructured_ingest/processes/connectors/couchbase.py
@@ -0,0 +1,336 @@
+import hashlib
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from datetime import timedelta
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, List
+
+from pydantic import BaseModel, Field, Secret
+
+from unstructured_ingest.data_types.file_data import (
+    BatchFileData,
+    BatchItem,
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
+from unstructured_ingest.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    Indexer,
+    IndexerConfig,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+    download_responses,
+)
+from unstructured_ingest.logger import logger
+from unstructured_ingest.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from couchbase.cluster import Cluster
+    from couchbase.collection import Collection
+
+CONNECTOR_TYPE = "couchbase"
+SERVER_API_VERSION = "1"
+
+
+class CouchbaseAdditionalMetadata(BaseModel):
+    bucket: str
+
+
+class CouchbaseBatchFileData(BatchFileData):
+    additional_metadata: CouchbaseAdditionalMetadata
+
+
+class CouchbaseAccessConfig(AccessConfig):
+    password: str = Field(description="The password for the Couchbase server")
+
+
+class CouchbaseConnectionConfig(ConnectionConfig):
+    username: str = Field(description="The username for the Couchbase server")
+    bucket: str = Field(description="The bucket to connect to on the Couchbase server")
+    connection_string: str = Field(
+        default="couchbase://localhost", description="The connection string of the Couchbase server"
+    )
+    scope: str = Field(
+        default="_default", description="The scope to connect to on the Couchbase server"
+    )
+    collection: str = Field(
+        default="_default", description="The collection to connect to on the Couchbase server"
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    access_config: Secret[CouchbaseAccessConfig]
+
+    @requires_dependencies(["couchbase"], extras="couchbase")
+    @contextmanager
+    def get_client(self) -> Generator["Cluster", None, None]:
+        from couchbase.auth import PasswordAuthenticator
+        from couchbase.cluster import Cluster
+        from couchbase.options import ClusterOptions
+
+        auth = PasswordAuthenticator(self.username, self.access_config.get_secret_value().password)
+        options = ClusterOptions(auth)
+        options.apply_profile("wan_development")
+        cluster = None
+        try:
+            cluster = Cluster(self.connection_string, options)
+            cluster.wait_until_ready(timedelta(seconds=5))
+            yield cluster
+        finally:
+            if cluster:
+                cluster.close()
+
+
+class CouchbaseUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class CouchbaseUploadStager(UploadStager):
+    upload_stager_config: CouchbaseUploadStagerConfig = field(
+        default_factory=lambda: CouchbaseUploadStagerConfig()
+    )
+
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        return {
+            data["element_id"]: {
+                "embedding": data.get("embeddings", None),
+                "text": data.get("text", None),
+                "metadata": data.get("metadata", None),
+                "type": data.get("type", None),
+            }
+        }
+
+
+class CouchbaseUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=50, description="Number of documents to upload per batch")
+
+
+@dataclass
+class CouchbaseUploader(Uploader):
+    connection_config: CouchbaseConnectionConfig
+    upload_config: CouchbaseUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        logger.info(
+            f"writing {len(data)} objects to destination "
+            f"bucket, {self.connection_config.bucket} "
+            f"at {self.connection_config.connection_string}",
+        )
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(self.connection_config.bucket)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)
+
+            for chunk in batch_generator(data, self.upload_config.batch_size):
+                collection.upsert_multi(
+                    {doc_id: doc for doc in chunk for doc_id, doc in doc.items()}
+                )
+
+
+class CouchbaseIndexerConfig(IndexerConfig):
+    batch_size: int = Field(default=50, description="Number of documents to index per batch")
+
+
+@dataclass
+class CouchbaseIndexer(Indexer):
+    connection_config: CouchbaseConnectionConfig
+    index_config: CouchbaseIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @requires_dependencies(["couchbase"], extras="couchbase")
+    def _get_doc_ids(self) -> List[str]:
+        query = (
+            f"SELECT META(d).id "
+            f"FROM `{self.connection_config.bucket}`."
+            f"`{self.connection_config.scope}`."
+            f"`{self.connection_config.collection}` as d"
+        )
+
+        max_attempts = 5
+        attempts = 0
+        while attempts < max_attempts:
+            try:
+                with self.connection_config.get_client() as client:
+                    result = client.query(query)
+                    document_ids = [row["id"] for row in result]
+                    return document_ids
+            except Exception as e:
+                attempts += 1
+                time.sleep(3)
+                if attempts == max_attempts:
+                    raise SourceConnectionError(f"failed to get document ids: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[CouchbaseBatchFileData, None, None]:
+        ids = self._get_doc_ids()
+        for batch in batch_generator(ids, self.index_config.batch_size):
+            # Make sure the hash is always a positive number to create the identifier
+            yield CouchbaseBatchFileData(
+                connector_type=CONNECTOR_TYPE,
+                metadata=FileDataSourceMetadata(
+                    url=f"{self.connection_config.connection_string}/"
+                    f"{self.connection_config.bucket}",
+                    date_processed=str(time.time()),
+                ),
+                additional_metadata=CouchbaseAdditionalMetadata(
+                    bucket=self.connection_config.bucket
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
+            )
+
+
+class CouchbaseDownloaderConfig(DownloaderConfig):
+    collection_id: str = Field(
+        default="id", description="The unique key of the id field in the collection"
+    )
+    fields: list[str] = field(default_factory=list)
+
+
+@dataclass
+class CouchbaseDownloader(Downloader):
+    connection_config: CouchbaseConnectionConfig
+    download_config: CouchbaseDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def is_async(self) -> bool:
+        return False
+
+    def get_identifier(self, bucket: str, record_id: str) -> str:
+        f = f"{bucket}-{record_id}"
+        if self.download_config.fields:
+            f = "{}-{}".format(
+                f,
+                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+            )
+        return f
+
+    def map_cb_results(self, cb_results: dict) -> str:
+        doc_body = cb_results
+        flattened_dict = flatten_dict(dictionary=doc_body)
+        str_values = [str(value) for value in flattened_dict.values()]
+        concatenated_values = "\n".join(str_values)
+        return concatenated_values
+
+    def generate_download_response(
+        self, result: dict, bucket: str, file_data: CouchbaseBatchFileData
+    ) -> DownloadResponse:
+        record_id = result[self.download_config.collection_id]
+        filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
+        filename = f"{filename_id}.txt"
+        download_path = self.download_dir / Path(filename)
+        logger.debug(
+            f"Downloading results from bucket {bucket} and id {record_id} to {download_path}"
+        )
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            with open(download_path, "w", encoding="utf8") as f:
+                f.write(self.map_cb_results(cb_results=result))
+        except Exception as e:
+            logger.error(
+                f"failed to download from bucket {bucket} "
+                f"and id {record_id} to {download_path}: {e}",
+                exc_info=True,
+            )
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time.time())
+        cast_file_data.metadata.record_locator = {
+            "connection_string": self.connection_config.connection_string,
+            "bucket": bucket,
+            "scope": self.connection_config.scope,
+            "collection": self.connection_config.collection,
+            "document_id": record_id,
+        }
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
+        )
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        couchbase_file_data = CouchbaseBatchFileData.cast(file_data=file_data)
+        bucket_name: str = couchbase_file_data.additional_metadata.bucket
+        ids: list[str] = [item.identifier for item in couchbase_file_data.batch_items]
+
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(bucket_name)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)
+
+            download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
+            return list(download_resp)
+
+    def process_doc_id(
+        self,
+        doc_id: str,
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
+        result = collection.get(doc_id)
+        return self.generate_download_response(
+            result=result.content_as[dict], bucket=bucket_name, file_data=file_data
+        )
+
+    def process_all_doc_ids(
+        self,
+        ids: list[str],
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
+        for doc_id in ids:
+            yield self.process_doc_id(doc_id, collection, bucket_name, file_data)
+
+    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        raise NotImplementedError()
+
+
+couchbase_destination_entry = DestinationRegistryEntry(
+    connection_config=CouchbaseConnectionConfig,
+    uploader=CouchbaseUploader,
+    uploader_config=CouchbaseUploaderConfig,
+    upload_stager=CouchbaseUploadStager,
+    upload_stager_config=CouchbaseUploadStagerConfig,
+)
+
+couchbase_source_entry = SourceRegistryEntry(
+    connection_config=CouchbaseConnectionConfig,
+    indexer=CouchbaseIndexer,
+    indexer_config=CouchbaseIndexerConfig,
+    downloader=CouchbaseDownloader,
+    downloader_config=CouchbaseDownloaderConfig,
+)
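
For orientation, a minimal usage sketch of the connector defined above; the credentials and bucket name are placeholders, not values from this release:

from unstructured_ingest.processes.connectors.couchbase import (
    CouchbaseAccessConfig,
    CouchbaseConnectionConfig,
    CouchbaseUploader,
    CouchbaseUploaderConfig,
)

# Placeholder connection values; connection_string, scope, and collection
# fall back to the defaults declared on CouchbaseConnectionConfig.
connection_config = CouchbaseConnectionConfig(
    username="admin",
    bucket="documents",
    access_config=CouchbaseAccessConfig(password="secret"),
)

uploader = CouchbaseUploader(
    connection_config=connection_config,
    upload_config=CouchbaseUploaderConfig(batch_size=100),
)
uploader.precheck()  # failures are re-raised as DestinationConnectionError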
unstructured_ingest/processes/connectors/databricks/__init__.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from unstructured_ingest.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .volumes_aws import CONNECTOR_TYPE as VOLUMES_AWS_CONNECTOR_TYPE
+from .volumes_aws import (
+    databricks_aws_volumes_destination_entry,
+    databricks_aws_volumes_source_entry,
+)
+from .volumes_azure import CONNECTOR_TYPE as VOLUMES_AZURE_CONNECTOR_TYPE
+from .volumes_azure import (
+    databricks_azure_volumes_destination_entry,
+    databricks_azure_volumes_source_entry,
+)
+from .volumes_gcp import CONNECTOR_TYPE as VOLUMES_GCP_CONNECTOR_TYPE
+from .volumes_gcp import (
+    databricks_gcp_volumes_destination_entry,
+    databricks_gcp_volumes_source_entry,
+)
+from .volumes_native import CONNECTOR_TYPE as VOLUMES_NATIVE_CONNECTOR_TYPE
+from .volumes_native import (
+    databricks_native_volumes_destination_entry,
+    databricks_native_volumes_source_entry,
+)
+from .volumes_table import CONNECTOR_TYPE as VOLUMES_TABLE_CONNECTOR_TYPE
+from .volumes_table import databricks_volumes_delta_tables_destination_entry
+
+add_source_entry(source_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_destination_entry
+)
+
+add_source_entry(source_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_destination_entry
+)
+
+add_source_entry(
+    source_type=VOLUMES_NATIVE_CONNECTOR_TYPE, entry=databricks_native_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_NATIVE_CONNECTOR_TYPE,
+    entry=databricks_native_volumes_destination_entry,
+)
+
+add_source_entry(
+    source_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_destination_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_TABLE_CONNECTOR_TYPE,
+    entry=databricks_volumes_delta_tables_destination_entry,
+)
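
The module above wires each Databricks volumes flavor into the shared connector registry at import time. The same pattern applies to any registry entry; a sketch using the Couchbase entries defined earlier in this diff:

from unstructured_ingest.processes.connector_registry import (
    add_destination_entry,
    add_source_entry,
)
from unstructured_ingest.processes.connectors.couchbase import (
    CONNECTOR_TYPE,
    couchbase_destination_entry,
    couchbase_source_entry,
)

# Registering an entry makes it discoverable by its connector type name,
# mirroring the add_*_entry calls in the __init__ module above.
add_source_entry(source_type=CONNECTOR_TYPE, entry=couchbase_source_entry)
add_destination_entry(destination_type=CONNECTOR_TYPE, entry=couchbase_destination_entry)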
unstructured_ingest/processes/connectors/databricks/volumes.py
@@ -0,0 +1,233 @@
+import io
+import os
+from abc import ABC
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import BaseModel, Field, Secret
+
+from unstructured_ingest.data_types.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+from unstructured_ingest.error import (
+    ProviderError,
+    RateLimitError,
+    UserAuthError,
+    UserError,
+)
+from unstructured_ingest.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    Indexer,
+    IndexerConfig,
+    Uploader,
+    UploaderConfig,
+)
+from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from databricks.sdk import WorkspaceClient
+
+
+class DatabricksPathMixin(BaseModel):
+    volume: str = Field(description="Name of volume in the Unity Catalog")
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    volume_path: Optional[str] = Field(
+        default=None, description="Optional path within the volume to write to"
+    )
+    databricks_schema: str = Field(
+        default="default",
+        alias="schema",
+        description="Schema associated with the volume to write to in the Unity Catalog service",
+    )
+
+    @property
+    def path(self) -> str:
+        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
+        if self.volume_path:
+            path = f"{path}/{self.volume_path}"
+        return path
+
+
+class DatabricksVolumesAccessConfig(AccessConfig):
+    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
+
+
+class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
+    access_config: Secret[DatabricksVolumesAccessConfig]
+    host: Optional[str] = Field(
+        default=None,
+        description="The Databricks host URL for either the "
+        "Databricks workspace endpoint or the "
+        "Databricks accounts endpoint.",
+    )
+
+    def wrap_error(self, e: Exception) -> Exception:
+        from databricks.sdk.errors.base import DatabricksError
+        from databricks.sdk.errors.platform import STATUS_CODE_MAPPING
+
+        if isinstance(e, ValueError):
+            error_message = e.args[0]
+            message_split = error_message.split(":")
+            if (message_split[0].endswith("auth")) or (
+                "Client authentication failed" in error_message
+            ):
+                return UserAuthError(e)
+        if isinstance(e, DatabricksError):
+            reverse_mapping = {v: k for k, v in STATUS_CODE_MAPPING.items()}
+            if status_code := reverse_mapping.get(type(e)):
+                if status_code in [401, 403]:
+                    return UserAuthError(e)
+                if status_code == 429:
+                    return RateLimitError(e)
+                if 400 <= status_code < 500:
+                    return UserError(e)
+                if 500 <= status_code < 600:
+                    return ProviderError(e)
+        logger.error(f"unhandled exception from databricks: {e}", exc_info=True)
+        return e
+
+    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
+    def get_client(self) -> "WorkspaceClient":
+        from databricks.sdk import WorkspaceClient
+        from databricks.sdk.core import Config
+
+        config = Config(
+            host=self.host,
+            **self.access_config.get_secret_value().model_dump(),
+        ).with_user_agent_extra(
+            "PyDatabricksSdk", os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
+        )
+
+        return WorkspaceClient(config=config)
+
+
+class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
+    recursive: bool = False
+
+
+@dataclass
+class DatabricksVolumesIndexer(Indexer, ABC):
+    index_config: DatabricksVolumesIndexerConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            raise self.connection_config.wrap_error(e=e) from e
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        try:
+            for file_info in self.connection_config.get_client().dbfs.list(
+                path=self.index_config.path, recursive=self.index_config.recursive
+            ):
+                if file_info.is_dir:
+                    continue
+                rel_path = file_info.path.replace(self.index_config.path, "")
+                if rel_path.startswith("/"):
+                    rel_path = rel_path[1:]
+                filename = Path(file_info.path).name
+                source_identifiers = SourceIdentifiers(
+                    filename=filename,
+                    rel_path=rel_path,
+                    fullpath=file_info.path,
+                )
+                yield FileData(
+                    identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
+                    connector_type=self.connector_type,
+                    source_identifiers=source_identifiers,
+                    additional_metadata={
+                        "catalog": self.index_config.catalog,
+                        "path": file_info.path,
+                    },
+                    metadata=FileDataSourceMetadata(
+                        url=file_info.path, date_modified=str(file_info.modification_time)
+                    ),
+                    display_name=source_identifiers.fullpath,
+                )
+        except Exception as e:
+            raise self.connection_config.wrap_error(e=e)
+
+
+class DatabricksVolumesDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksVolumesDownloader(Downloader, ABC):
+    download_config: DatabricksVolumesDownloaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            raise self.connection_config.wrap_error(e=e)
+
+    def get_download_path(self, file_data: FileData) -> Path:
+        return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        volumes_path = file_data.additional_metadata["path"]
+        logger.info(f"Writing {file_data.identifier} to {download_path}")
+        try:
+            with self.connection_config.get_client().dbfs.download(path=volumes_path) as c:
+                read_content = c._read_handle.read()
+        except Exception as e:
+            raise self.connection_config.wrap_error(e=e)
+        with open(download_path, "wb") as f:
+            f.write(read_content)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
+    pass
+
+
+@dataclass
+class DatabricksVolumesUploader(Uploader, ABC):
+    upload_config: DatabricksVolumesUploaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def get_output_path(self, file_data: FileData) -> str:
+        if file_data.source_identifiers.relative_path:
+            return os.path.join(
+                self.upload_config.path,
+                f"{file_data.source_identifiers.relative_path.lstrip('/')}.json",
+            )
+        else:
+            return os.path.join(
+                self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
+            )
+
+    def precheck(self) -> None:
+        try:
+            assert self.connection_config.get_client().current_user.me().active
+        except Exception as e:
+            raise self.connection_config.wrap_error(e=e)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        output_path = self.get_output_path(file_data=file_data)
+        with open(path, "rb") as elements_file:
+            try:
+                # Read file bytes and wrap in BytesIO to create BinaryIO object
+                file_bytes = elements_file.read()
+                binary_data = io.BytesIO(file_bytes)
+                self.connection_config.get_client().files.upload(
+                    file_path=output_path,
+                    content=binary_data,  # Changed from 'contents' to 'content' in SDK 0.70.0+
+                    overwrite=True,
+                )
+            except Exception as e:
+                raise self.connection_config.wrap_error(e=e)
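
A quick sketch of the path layout produced by the DatabricksPathMixin.path property above, assuming UploaderConfig adds no required fields of its own; the catalog, schema, and volume values are made up:

from unstructured_ingest.processes.connectors.databricks.volumes import (
    DatabricksVolumesUploaderConfig,
)

# "schema" populates databricks_schema through the field alias declared above.
cfg = DatabricksVolumesUploaderConfig(
    catalog="main",
    schema="default",
    volume="ingest",
    volume_path="output/run-1",
)
assert cfg.path == "/Volumes/main/default/ingest/output/run-1"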