unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,59 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
6
+ from unstructured_ingest.processes.connectors.qdrant.qdrant import (
7
+ QdrantAccessConfig,
8
+ QdrantConnectionConfig,
9
+ QdrantUploader,
10
+ QdrantUploaderConfig,
11
+ QdrantUploadStager,
12
+ QdrantUploadStagerConfig,
13
+ )
14
+
15
+ CONNECTOR_TYPE = "qdrant-cloud"
16
+
17
+
18
# Secret material for Qdrant Cloud: a mandatory API key (the field has no default).
class CloudQdrantAccessConfig(QdrantAccessConfig):
    api_key: str = Field(
        description="Qdrant API key",
    )
20
+
21
+
22
# Connection settings for a Qdrant Cloud cluster.
# NOTE(review): `url` is annotated `str` but defaults to None — confirm whether
# it is meant to be required or Optional[str].
class CloudQdrantConnectionConfig(QdrantConnectionConfig):
    url: str = Field(default=None, description="url of Qdrant Cloud")
    access_config: Secret[CloudQdrantAccessConfig]

    def get_client_kwargs(self) -> dict:
        """Return the ``api_key`` and ``url`` keyword arguments for the client.

        The API key is read from the unwrapped secret access config.
        """
        credentials = self.access_config.get_secret_value()
        return {"api_key": credentials.api_key, "url": self.url}
31
+
32
+
33
# Qdrant Cloud flavour of the stager config; declares nothing beyond what it
# inherits from QdrantUploadStagerConfig.
class CloudQdrantUploadStagerConfig(QdrantUploadStagerConfig):
    pass
35
+
36
+
37
@dataclass
class CloudQdrantUploadStager(QdrantUploadStager):
    """Qdrant upload stager typed against the Qdrant Cloud stager config."""

    upload_stager_config: CloudQdrantUploadStagerConfig
40
+
41
+
42
# Qdrant Cloud flavour of the uploader config; declares nothing beyond what it
# inherits from QdrantUploaderConfig.
class CloudQdrantUploaderConfig(QdrantUploaderConfig):
    pass
44
+
45
+
46
@dataclass
class CloudQdrantUploader(QdrantUploader):
    """Qdrant uploader bound to the Qdrant Cloud connection and uploader configs."""

    connection_config: CloudQdrantConnectionConfig
    upload_config: CloudQdrantUploaderConfig
    # Defaults to this module's CONNECTOR_TYPE constant.
    connector_type: str = CONNECTOR_TYPE
51
+
52
+
53
# Destination registry entry grouping the Qdrant Cloud connection, stager and
# uploader classes together with their config classes.
qdrant_cloud_destination_entry = DestinationRegistryEntry(
    upload_stager_config=CloudQdrantUploadStagerConfig,
    upload_stager=CloudQdrantUploadStager,
    uploader_config=CloudQdrantUploaderConfig,
    uploader=CloudQdrantUploader,
    connection_config=CloudQdrantConnectionConfig,
)
@@ -0,0 +1,58 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
6
+ from unstructured_ingest.processes.connectors.qdrant.qdrant import (
7
+ QdrantAccessConfig,
8
+ QdrantConnectionConfig,
9
+ QdrantUploader,
10
+ QdrantUploaderConfig,
11
+ QdrantUploadStager,
12
+ QdrantUploadStagerConfig,
13
+ )
14
+
15
+ CONNECTOR_TYPE = "qdrant-local"
16
+
17
+
18
# Access config for the local Qdrant variant; declares no fields of its own.
class LocalQdrantAccessConfig(QdrantAccessConfig):
    pass
20
+
21
+
22
# Connection settings for the local Qdrant variant, addressed by a path.
# NOTE(review): `path` is annotated `str` but defaults to None — confirm whether
# it is meant to be required or Optional[str].
class LocalQdrantConnectionConfig(QdrantConnectionConfig):
    path: str = Field(default=None, description="Persistence path for QdrantLocal.")
    access_config: Secret[LocalQdrantAccessConfig] = Field(
        validate_default=True,
        default_factory=LocalQdrantAccessConfig,
    )

    def get_client_kwargs(self) -> dict:
        """Return the ``path`` keyword argument for the client."""
        return dict(path=self.path)
30
+
31
+
32
# Local flavour of the stager config; declares nothing beyond what it inherits
# from QdrantUploadStagerConfig.
class LocalQdrantUploadStagerConfig(QdrantUploadStagerConfig):
    pass
34
+
35
+
36
@dataclass
class LocalQdrantUploadStager(QdrantUploadStager):
    """Qdrant upload stager typed against the local stager config."""

    upload_stager_config: LocalQdrantUploadStagerConfig
39
+
40
+
41
# Local flavour of the uploader config; declares nothing beyond what it inherits
# from QdrantUploaderConfig.
class LocalQdrantUploaderConfig(QdrantUploaderConfig):
    pass
43
+
44
+
45
@dataclass
class LocalQdrantUploader(QdrantUploader):
    """Qdrant uploader bound to the local connection and uploader configs."""

    connection_config: LocalQdrantConnectionConfig
    upload_config: LocalQdrantUploaderConfig
    # Defaults to this module's CONNECTOR_TYPE constant.
    connector_type: str = CONNECTOR_TYPE
50
+
51
+
52
# Destination registry entry grouping the local Qdrant connection, stager and
# uploader classes together with their config classes.
qdrant_local_destination_entry = DestinationRegistryEntry(
    upload_stager_config=LocalQdrantUploadStagerConfig,
    upload_stager=LocalQdrantUploadStager,
    uploader_config=LocalQdrantUploaderConfig,
    uploader=LocalQdrantUploader,
    connection_config=LocalQdrantConnectionConfig,
)
@@ -0,0 +1,163 @@
1
+ import asyncio
2
+ import json
3
+ from abc import ABC, abstractmethod
4
+ from contextlib import asynccontextmanager, contextmanager
5
+ from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional
7
+
8
+ from pydantic import Field, Secret
9
+
10
+ from unstructured_ingest.data_types.file_data import FileData
11
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
12
+ from unstructured_ingest.interfaces import (
13
+ AccessConfig,
14
+ ConnectionConfig,
15
+ Uploader,
16
+ UploaderConfig,
17
+ UploadStager,
18
+ UploadStagerConfig,
19
+ )
20
+ from unstructured_ingest.logger import logger
21
+ from unstructured_ingest.utils.data_prep import (
22
+ batch_generator,
23
+ flatten_dict,
24
+ get_enhanced_element_id,
25
+ )
26
+ from unstructured_ingest.utils.dep_check import requires_dependencies
27
+
28
+ if TYPE_CHECKING:
29
+ from qdrant_client import AsyncQdrantClient, QdrantClient
30
+
31
+
32
# Base access config for the Qdrant connectors; declares no fields itself.
# It is also used as the default factory for QdrantConnectionConfig.access_config.
class QdrantAccessConfig(AccessConfig, ABC):
    pass
34
+
35
+
36
# Abstract base for Qdrant connection settings. Subclasses supply the keyword
# arguments; this class turns them into sync and async clients that are closed
# when the surrounding context exits.
class QdrantConnectionConfig(ConnectionConfig, ABC):
    access_config: Secret[QdrantAccessConfig] = Field(
        default_factory=QdrantAccessConfig, validate_default=True, description="Access Config"
    )

    @abstractmethod
    def get_client_kwargs(self) -> dict:
        """Return the keyword arguments passed to the Qdrant client constructors."""

    @requires_dependencies(["qdrant_client"], extras="qdrant")
    @asynccontextmanager
    async def get_async_client(self) -> AsyncGenerator["AsyncQdrantClient", None]:
        """Yield an ``AsyncQdrantClient`` and close it once the context exits."""
        # Imported lazily so the module loads without the optional dependency.
        from qdrant_client import AsyncQdrantClient

        async_client = AsyncQdrantClient(**self.get_client_kwargs())
        try:
            yield async_client
        finally:
            await async_client.close()

    @requires_dependencies(["qdrant_client"], extras="qdrant")
    @contextmanager
    def get_client(self) -> Generator["QdrantClient", None, None]:
        """Yield a ``QdrantClient`` and close it once the context exits."""
        # Imported lazily so the module loads without the optional dependency.
        from qdrant_client import QdrantClient

        sync_client = QdrantClient(**self.get_client_kwargs())
        try:
            yield sync_client
        finally:
            sync_client.close()
68
+
69
+
70
# Stager config shared by the Qdrant connectors; declares nothing beyond what it
# inherits from UploadStagerConfig.
class QdrantUploadStagerConfig(UploadStagerConfig):
    pass
72
+
73
+
74
@dataclass
class QdrantUploadStager(UploadStager, ABC):
    """Stager that reshapes element dictionaries into Qdrant point dictionaries."""

    upload_stager_config: QdrantUploadStagerConfig = field(
        default_factory=QdrantUploadStagerConfig
    )

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Prepare a dictionary in the format that Qdrant requires.

        Args:
            element_dict: A single element; it is shallow-copied before any key
                is popped.
            file_data: Forwarded to ``get_enhanced_element_id`` to derive the id.

        Returns:
            A dict with ``id``, ``vector`` and ``payload`` keys.
        """
        data = element_dict.copy()
        # Order matters: the id is derived from the complete element, then
        # "embeddings" and "text" are popped so that neither ends up in the
        # serialized or flattened payload entries built afterwards.
        point_id = get_enhanced_element_id(element_dict=data, file_data=file_data)
        vector = data.pop("embeddings", {})
        text = data.pop("text", None)
        return {
            "id": point_id,
            "vector": vector,
            "payload": {
                "text": text,
                "element_serialized": json.dumps(data),
                **flatten_dict(
                    data,
                    separator="-",
                    flatten_lists=True,
                ),
            },
        }
96
+
97
+
98
# Upload settings shared by the Qdrant connectors.
# NOTE(review): `num_processes` is flagged deprecated and its description talks
# about threads rather than processes — confirm the intended wording.
class QdrantUploaderConfig(UploaderConfig):
    collection_name: str = Field(
        description="Name of the collection.",
    )
    batch_size: int = Field(
        description="Number of records per batch.",
        default=50,
    )
    num_processes: Optional[int] = Field(
        deprecated=True,
        description="Optional limit on number of threads to use for upload.",
        default=1,
    )
106
+
107
+
108
@dataclass
class QdrantUploader(Uploader, ABC):
    """Base uploader that upserts batches of points into a Qdrant collection."""

    upload_config: QdrantUploaderConfig
    connection_config: QdrantConnectionConfig

    @DestinationConnectionError.wrap
    def precheck(self) -> None:
        """Check that the configured collection is listed by the server.

        Raises:
            DestinationConnectionError: If the collection name is not among the
                names returned by ``get_collections``.
        """
        with self.connection_config.get_client() as client:
            listing = client.get_collections()
            known_names = [entry.name for entry in listing.collections]
            if self.upload_config.collection_name in known_names:
                return
            raise DestinationConnectionError(
                "collection '{}' not found: {}".format(
                    self.upload_config.collection_name, ", ".join(known_names)
                )
            )

    def is_async(self) -> bool:
        """Report that this uploader is driven through ``run_data_async``."""
        return True

    async def run_data_async(
        self,
        data: list[dict],
        file_data: FileData,
        **kwargs: Any,
    ) -> None:
        """Split ``data`` into batches and upsert all of them concurrently.

        ``file_data`` and ``kwargs`` are accepted but not used here.
        """
        batches = list(batch_generator(data, batch_size=self.upload_config.batch_size))
        logger.debug(
            "Elements split into %i batches of size %i.",
            len(batches),
            self.upload_config.batch_size,
        )
        pending = [self._upsert_batch(chunk) for chunk in batches]
        await asyncio.gather(*pending)

    async def _upsert_batch(self, batch: list[dict]) -> None:
        """Upsert one batch of point dictionaries.

        Raises:
            WriteError: Wraps any exception raised while opening the async
                client or performing the upsert.
        """
        from qdrant_client import models

        # Built outside the ``try`` so construction errors are not wrapped.
        points: list[models.PointStruct] = [models.PointStruct(**entry) for entry in batch]
        try:
            logger.debug(
                "Upserting %i points to the '%s' collection.",
                len(points),
                self.upload_config.collection_name,
            )
            # A fresh async client is opened (and closed) for every batch.
            async with self.connection_config.get_async_client() as async_client:
                await async_client.upsert(
                    self.upload_config.collection_name, points=points, wait=True
                )
        except Exception as api_error:
            logger.error(
                "Failed to upsert points to the collection due to the following error %s", api_error
            )
            raise WriteError(f"Qdrant error: {api_error}") from api_error
        else:
            logger.debug("Successfully upsert points to the collection.")
@@ -0,0 +1,60 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
6
+ from unstructured_ingest.processes.connectors.qdrant.qdrant import (
7
+ QdrantAccessConfig,
8
+ QdrantConnectionConfig,
9
+ QdrantUploader,
10
+ QdrantUploaderConfig,
11
+ QdrantUploadStager,
12
+ QdrantUploadStagerConfig,
13
+ )
14
+
15
+ CONNECTOR_TYPE = "qdrant-server"
16
+
17
+
18
# Access config for the Qdrant server variant; declares no fields of its own.
class ServerQdrantAccessConfig(QdrantAccessConfig):
    pass
20
+
21
+
22
# Connection settings for a Qdrant server addressed by URL.
# NOTE(review): `url` is annotated `str` but defaults to None — confirm whether
# it is meant to be required or Optional[str].
class ServerQdrantConnectionConfig(QdrantConnectionConfig):
    url: str = Field(default=None, description="url of Qdrant server")
    access_config: Secret[ServerQdrantAccessConfig] = Field(
        validate_default=True,
        default_factory=ServerQdrantAccessConfig,
    )

    def get_client_kwargs(self) -> dict:
        """Return the ``url`` keyword argument for the client."""
        return dict(url=self.url)
32
+
33
+
34
# Server flavour of the stager config; declares nothing beyond what it inherits
# from QdrantUploadStagerConfig.
class ServerQdrantUploadStagerConfig(QdrantUploadStagerConfig):
    pass
36
+
37
+
38
@dataclass
class ServerQdrantUploadStager(QdrantUploadStager):
    """Qdrant upload stager typed against the server stager config."""

    upload_stager_config: ServerQdrantUploadStagerConfig
41
+
42
+
43
# Server flavour of the uploader config; declares nothing beyond what it inherits
# from QdrantUploaderConfig.
class ServerQdrantUploaderConfig(QdrantUploaderConfig):
    pass
45
+
46
+
47
@dataclass
class ServerQdrantUploader(QdrantUploader):
    """Qdrant uploader bound to the server connection and uploader configs."""

    connection_config: ServerQdrantConnectionConfig
    upload_config: ServerQdrantUploaderConfig
    # Defaults to this module's CONNECTOR_TYPE constant.
    connector_type: str = CONNECTOR_TYPE
52
+
53
+
54
# Destination registry entry grouping the Qdrant server connection, stager and
# uploader classes together with their config classes.
qdrant_server_destination_entry = DestinationRegistryEntry(
    upload_stager_config=ServerQdrantUploadStagerConfig,
    upload_stager=ServerQdrantUploadStager,
    uploader_config=ServerQdrantUploaderConfig,
    uploader=ServerQdrantUploader,
    connection_config=ServerQdrantConnectionConfig,
)
@@ -0,0 +1,214 @@
1
+ import json
2
+ from contextlib import asynccontextmanager, contextmanager
3
+ from dataclasses import dataclass
4
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional
5
+
6
+ from pydantic import Field, Secret, model_validator
7
+
8
+ from unstructured_ingest.data_types.file_data import FileData
9
+ from unstructured_ingest.error import DestinationConnectionError, ResponseError, ValueError
10
+ from unstructured_ingest.interfaces import (
11
+ AccessConfig,
12
+ ConnectionConfig,
13
+ Uploader,
14
+ UploaderConfig,
15
+ )
16
+ from unstructured_ingest.logger import logger
17
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
18
+ from unstructured_ingest.utils.data_prep import batch_generator
19
+ from unstructured_ingest.utils.dep_check import requires_dependencies
20
+
21
+ if TYPE_CHECKING:
22
+ from redis.asyncio import Redis
23
+
24
+ import asyncio
25
+
26
# Default value of the ``connector_type`` fields declared in this module.
CONNECTOR_TYPE = "redis"
# NOTE(review): not referenced by any code visible in this module -- confirm it
# is still needed before relying on it.
SERVER_API_VERSION = "1"
28
+
29
+
30
class RedisAccessConfig(AccessConfig):
    """Sensitive Redis settings: a full connection URI and/or a password."""

    uri: Optional[str] = Field(
        description="If not anonymous, use this uri, if specified.",
        default=None,
    )
    password: Optional[str] = Field(
        description=(
            "Password used to connect to database if uri is "
            "not specified and connection is not anonymous."
        ),
        default=None,
    )
39
+
40
+
41
class RedisConnectionConfig(ConnectionConfig):
    """Connection settings for Redis, given as a URI or as discrete fields.

    When ``access_config.uri`` is set, clients are built from that URI alone and
    the discrete ``host``/``port``/``database``/``ssl``/``username`` fields and
    the password are not used. Otherwise those fields are passed to the Redis
    client constructor.
    """

    access_config: Secret[RedisAccessConfig] = Field(
        default=RedisAccessConfig(), validate_default=True
    )
    host: Optional[str] = Field(
        default=None,
        description="Hostname or IP address of a Redis instance to connect to "
        "if uri is not specified.",
    )
    database: int = Field(default=0, description="Database index to connect to.")
    port: Optional[int] = Field(
        default=6379, description="Port used to connect to database if uri is not specified."
    )
    username: Optional[str] = Field(
        default=None, description="Username used to connect to database if uri is not specified."
    )
    ssl: Optional[bool] = Field(
        default=True,
        description="Whether the connection should use SSL encryption if uri is not specified.",
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @model_validator(mode="after")
    def validate_host_or_url(self) -> "RedisConnectionConfig":
        """Require ``host``, ``port`` and ``ssl`` when no URI is configured.

        Returns:
            The validated instance itself.

        Raises:
            ValueError: if no URI is set and ``host`` is empty, or ``port`` or
                ``ssl`` is ``None``. This is the ``ValueError`` imported from
                ``unstructured_ingest.error`` at the top of the module.
        """
        # A URI carries everything needed to connect; nothing else to check.
        if self.access_config.get_secret_value().uri:
            return self
        if not self.host:
            raise ValueError("Please pass a hostname either directly or through uri")
        if self.port is None:
            raise ValueError("Since URI is not specified, port cannot be None")
        if self.ssl is None:
            raise ValueError("Since URI is not specified, ssl cannot be None")
        return self

    def _client_options(self) -> dict[str, Any]:
        """Build the keyword arguments for a Redis client from the discrete fields."""
        options: dict[str, Any] = {
            "host": self.host,
            "port": self.port,
            "db": self.database,
            "ssl": self.ssl,
            "username": self.username,
        }
        # Only send a password when one is configured (anonymous otherwise).
        password = self.access_config.get_secret_value().password
        if password:
            options["password"] = password
        return options

    @requires_dependencies(["redis"], extras="redis")
    @asynccontextmanager
    async def create_async_client(self) -> AsyncGenerator["Redis", None]:
        """Yield an asyncio Redis client, leaving its context when the caller is done.

        The client is built from the URI when one is configured, otherwise from
        the discrete connection fields.
        """
        from redis.asyncio import Redis, from_url

        uri = self.access_config.get_secret_value().uri
        client = from_url(uri) if uri else Redis(**self._client_options())
        async with client as connected:
            yield connected

    @requires_dependencies(["redis"], extras="redis")
    @contextmanager
    def create_client(self) -> Generator["Redis", None, None]:
        """Yield a synchronous Redis client, leaving its context when the caller is done.

        The client is built from the URI when one is configured, otherwise from
        the discrete connection fields.
        """
        # The "Redis" name in the annotation is the TYPE_CHECKING import from
        # ``redis.asyncio``; the object yielded here is the synchronous client.
        from redis import Redis, from_url

        uri = self.access_config.get_secret_value().uri
        client = from_url(uri) if uri else Redis(**self._client_options())
        with client as connected:
            yield connected
122
+
123
+
124
class RedisUploaderConfig(UploaderConfig):
    """Settings that control how elements are written to Redis."""

    # How many elements go into one batch (see ``batch_size`` description).
    batch_size: int = Field(
        description="Number of records per batch",
        default=100,
    )
    # Prepended verbatim to each element's key; empty by default.
    key_prefix: str = Field(
        description="Prefix for Redis keys",
        default="",
    )
127
+
128
+
129
+ def _form_redis_pipeline_error_message(error: str) -> str:
130
+ """
131
+ Form a user-friendly error message for Redis pipeline errors.
132
+ The error message has `$` character at the beginning and `) of pipeline` at the end.
133
+ Everything between these two strings is the value an should be removed.
134
+ """
135
+ start = error.find("$")
136
+ end = error.find(") of pipeline")
137
+ if start != -1 and end != -1:
138
+ return error[: start + 1] + "<value>" + error[end:]
139
+ else:
140
+ return error
141
+
142
+
143
@dataclass
class RedisUploader(Uploader):
    """Uploader that stores each element in Redis under ``key_prefix + element_id``.

    Elements are written as JSON documents when the server accepts ``JSON.SET``
    and as JSON-encoded strings otherwise.
    """

    upload_config: RedisUploaderConfig
    connection_config: RedisConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Return ``True``; the upload path of this class is ``run_data_async``."""
        return True

    def precheck(self) -> None:
        """Verify connectivity by pinging the server with a synchronous client.

        Raises:
            DestinationConnectionError: if creating the client or the ping
                fails; the original exception is chained as the cause.
        """
        try:
            with self.connection_config.create_client() as client:
                client.ping()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Write all elements in ``data`` to Redis, one pipeline per batch.

        Args:
            data: Elements to store; each must contain an ``element_id`` key.
            file_data: Not used by this method.
            **kwargs: Not used by this method.
        """
        if not data:
            # Nothing to probe the server with and nothing to write.
            logger.debug("no elements to write to redis, skipping upload")
            return

        # The probe stores the first element itself; the batches below store
        # it again together with the remaining elements.
        redis_stack = await self._check_redis_stack(data[0])
        logger.info(
            f"writing {len(data)} objects to destination asynchronously, "
            f"db, {self.connection_config.database}, "
            f"at {self.connection_config.host}",
        )

        batches = list(batch_generator(data, batch_size=self.upload_config.batch_size))
        await asyncio.gather(*[self._write_batch(batch, redis_stack) for batch in batches])

    async def _write_batch(self, batch: list[dict], redis_stack: bool) -> None:
        """Queue every element of ``batch`` on one transactional pipeline and execute it.

        Args:
            batch: Elements to store; each must contain an ``element_id`` key.
            redis_stack: ``True`` to store elements as JSON documents,
                ``False`` to store them as JSON-encoded strings.
        """
        async with (
            self.connection_config.create_async_client() as async_client,
            async_client.pipeline(transaction=True) as pipe,
        ):
            for element in batch:
                key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
                if redis_stack:
                    pipe.json().set(key_with_prefix, "$", element)
                else:
                    pipe.set(key_with_prefix, json.dumps(element))
            await pipe.execute()

    @requires_dependencies(["redis"], extras="redis")
    async def _check_redis_stack(self, element: dict) -> bool:
        """Probe whether the server supports the JSON type by storing ``element``.

        The element is first written with ``JSON.SET``. If the server reports
        that command as unknown, the element is written as a JSON-encoded
        string instead.

        Args:
            element: Element used for the probe; must contain ``element_id``.

        Returns:
            ``True`` if ``JSON.SET`` succeeded, ``False`` if the string
            fallback was used.

        Raises:
            ResponseError: for any other Redis response error, with the stored
                value redacted from the message.
        """
        from redis import exceptions as redis_exceptions

        redis_stack = True
        async with (
            self.connection_config.create_async_client() as async_client,
            async_client.pipeline(transaction=True) as pipe,
        ):
            key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
            try:
                # Redis with stack extension supports JSON type
                await pipe.json().set(key_with_prefix, "$", element).execute()
            except redis_exceptions.ResponseError as e:
                message = _form_redis_pipeline_error_message(str(e))
                # The command name is quoted with backticks by some server
                # versions and with single quotes by others; accept both.
                # NOTE(review): verify the wording against the targeted servers.
                json_set_unknown = (
                    "unknown command `JSON.SET`" in message
                    or "unknown command 'JSON.SET'" in message
                )
                if json_set_unknown:
                    # if this error occurs, Redis server doesn't support JSON type,
                    # so save as string type instead
                    await pipe.set(key_with_prefix, json.dumps(element)).execute()
                    redis_stack = False
                else:
                    raise ResponseError(message) from e
        return redis_stack
208
+
209
+
210
# Registry entry bundling the Redis uploader with its config classes; no
# upload stager is supplied for this destination.
redis_destination_entry = DestinationRegistryEntry(
    uploader=RedisUploader,
    uploader_config=RedisUploaderConfig,
    connection_config=RedisConnectionConfig,
)