unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,275 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from typing import TYPE_CHECKING, Any, Generator
5
+
6
+ from pydantic import Field, Secret
7
+
8
+ from unstructured_ingest.data_types.file_data import FileData
9
+ from unstructured_ingest.error import DestinationConnectionError, ValueError, WriteError
10
+ from unstructured_ingest.interfaces import (
11
+ AccessConfig,
12
+ ConnectionConfig,
13
+ Uploader,
14
+ UploaderConfig,
15
+ UploadStager,
16
+ UploadStagerConfig,
17
+ )
18
+ from unstructured_ingest.logger import logger
19
+ from unstructured_ingest.processes.connector_registry import (
20
+ DestinationRegistryEntry,
21
+ )
22
+ from unstructured_ingest.processes.connectors.utils import parse_datetime
23
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
24
+ from unstructured_ingest.utils.data_prep import batch_generator, get_enhanced_element_id
25
+ from unstructured_ingest.utils.dep_check import requires_dependencies
26
+
27
+ if TYPE_CHECKING:
28
+ from azure.search.documents import SearchClient
29
+ from azure.search.documents.indexes import SearchIndexClient
30
+
31
# Connector identifier; used below as the default ``connector_type`` of the uploader.
CONNECTOR_TYPE = "azure_ai_search"
32
+
33
+
34
class AzureAISearchAccessConfig(AccessConfig):
    """Secret values needed to authenticate against an Azure AI Search service."""

    # Supplied under the ``key`` alias; the connection config wraps this value in an
    # ``AzureKeyCredential`` when it builds the Azure clients.
    azure_ai_search_key: str = Field(
        description="Credential that is used for authenticating to an Azure service",
        alias="key",
    )
38
+
39
+
40
class AzureAISearchConnectionConfig(ConnectionConfig):
    """Connection settings for one Azure AI Search index, plus factories for SDK clients."""

    endpoint: str = Field(
        description=(
            "The URL endpoint of an Azure AI (Cognitive) search service. "
            "In the form of https://{{service_name}}.search.windows.net"
        )
    )
    index: str = Field(
        description="The name of the Azure AI (Cognitive) Search index to connect to."
    )
    access_config: Secret[AzureAISearchAccessConfig]

    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
    @contextmanager
    def get_search_client(self) -> Generator["SearchClient", None, None]:
        """Yield a ``SearchClient`` bound to ``self.endpoint`` and ``self.index``.

        The client is entered as a context manager, so it is exited when the
        caller's ``with`` block finishes.
        """
        # Imported lazily so the module can be loaded without the Azure extras installed.
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents import SearchClient

        api_key = self.access_config.get_secret_value().azure_ai_search_key
        document_client = SearchClient(
            endpoint=self.endpoint,
            index_name=self.index,
            credential=AzureKeyCredential(api_key),
        )
        with document_client as client:
            yield client

    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
    @contextmanager
    def get_search_index_client(self) -> Generator["SearchIndexClient", None, None]:
        """Yield a ``SearchIndexClient`` for ``self.endpoint``.

        Unlike ``get_search_client`` this client is not bound to an index name;
        callers pass the index name to the operations they invoke.
        """
        # Imported lazily so the module can be loaded without the Azure extras installed.
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents.indexes import SearchIndexClient

        api_key = self.access_config.get_secret_value().azure_ai_search_key
        index_client = SearchIndexClient(
            endpoint=self.endpoint,
            credential=AzureKeyCredential(api_key),
        )
        with index_client as search_index_client:
            yield search_index_client
78
+
79
+
80
class AzureAISearchUploadStagerConfig(UploadStagerConfig):
    """Stager settings for Azure AI Search; declares nothing beyond ``UploadStagerConfig``."""
82
+
83
+
84
class AzureAISearchUploaderConfig(UploaderConfig):
    """Tunable settings for the Azure AI Search uploader."""

    # Handed to ``batch_generator`` when the uploader splits the data into upload calls.
    batch_size: int = Field(description="Number of records per batch", default=100)

    # Name of the index field the uploader inspects (must exist and be filterable)
    # before it attempts to delete content written by a previous run.
    record_id_key: str = Field(
        description="searchable key to find entries for the same record on previous runs",
        default=RECORD_ID_LABEL,
    )
90
+
91
+
92
@dataclass
class AzureAISearchUploadStager(UploadStager):
    """Reshapes element dictionaries into the form uploaded to Azure AI Search."""

    upload_stager_config: AzureAISearchUploadStagerConfig = field(
        default_factory=lambda: AzureAISearchUploadStagerConfig()
    )

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Return a copy of *element_dict* conformed to the Azure AI Search index schema.

        The copy receives an ``id`` (from ``get_enhanced_element_id``) and a
        ``RECORD_ID_LABEL`` entry holding ``file_data.identifier``. Metadata values
        that are present and truthy are then rewritten:

        * ``coordinates.points``, ``data_source.record_locator``,
          ``data_source.permissions_data`` and ``regex_metadata`` become JSON strings.
        * ``links`` becomes a list with one JSON string per link.
        * ``data_source.version`` and ``page_number`` become ``str``.
        * ``last_modified`` and ``data_source.date_created`` / ``date_modified`` /
          ``date_processed`` are parsed with ``parse_datetime`` and re-formatted as
          ``%Y-%m-%dT%H:%M:%S.%fZ``.

        Args:
            element_dict: A single element serialized as a dictionary. It is not
                modified.
            file_data: Source-file descriptor; its ``identifier`` becomes the record id.

        Returns:
            A new dictionary, independent of *element_dict*, with the rewrites applied.
        """
        from copy import deepcopy

        # A shallow copy would share the nested "metadata" dicts with the caller, so
        # the in-place rewrites below would leak back into ``element_dict``.
        data = deepcopy(element_dict)
        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
        data[RECORD_ID_LABEL] = file_data.identifier

        # ``or {}`` also covers keys that are present but set to None. When a section
        # exists it is the very dict stored inside ``data``, so assignments stick.
        metadata = data.get("metadata") or {}
        coordinates = metadata.get("coordinates") or {}
        data_source = metadata.get("data_source") or {}

        if points := coordinates.get("points"):
            coordinates["points"] = json.dumps(points)
        if version := data_source.get("version"):
            data_source["version"] = str(version)
        for json_key in ("record_locator", "permissions_data"):
            if value := data_source.get(json_key):
                data_source[json_key] = json.dumps(value)
        if links := metadata.get("links"):
            metadata["links"] = [json.dumps(link) for link in links]

        # NOTE(review): strftime appends a literal "Z" without converting the
        # timezone -- confirm parse_datetime yields UTC (or naive-UTC) values.
        timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = parse_datetime(last_modified).strftime(
                timestamp_format
            )
        for date_key in ("date_created", "date_modified", "date_processed"):
            if raw_date := data_source.get(date_key):
                data_source[date_key] = parse_datetime(raw_date).strftime(timestamp_format)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)
        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)
        return data
144
+
145
+
146
@dataclass
class AzureAISearchUploader(Uploader):
    """Writes staged element dictionaries to an Azure AI Search index.

    Before uploading, ``run_data`` removes documents previously written for the
    same record, provided the index exposes a filterable field named
    ``upload_config.record_id_key``.
    """

    upload_config: AzureAISearchUploaderConfig
    connection_config: AzureAISearchConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def query_docs(self, record_id: str, index_key: str) -> list[str]:
        """Return the *index_key* values of documents whose ``record_id`` equals *record_id*.

        Args:
            record_id: Value to match against the ``record_id`` index field.
            index_key: Name of the index's key field; the only field selected.
        """
        # OData string literals are delimited by single quotes; an embedded quote has
        # to be doubled, otherwise the filter is malformed (or altered) by the value.
        escaped_record_id = str(record_id).replace("'", "''")
        # NOTE(review): the field name is the literal "record_id", not
        # ``upload_config.record_id_key`` -- presumably matching RECORD_ID_LABEL as
        # written by the stager; confirm before making this configurable.
        with self.connection_config.get_search_client() as search_client:
            # Materialize while the client is still open.
            results = list(
                search_client.search(
                    filter=f"record_id eq '{escaped_record_id}'", select=[index_key]
                )
            )
        return [result[index_key] for result in results]

    def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
        """Delete every indexed document that belongs to ``file_data.identifier``.

        Args:
            file_data: Descriptor of the record whose previous documents are removed.
            index_key: Name of the index's key field, used to address documents.

        Raises:
            WriteError: If the service reports a failure for any document.
        """
        logger.debug(
            f"deleting any content with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from azure cognitive search index: {self.connection_config.index}"
        )
        doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
        if not doc_ids_to_delete:
            return
        with self.connection_config.get_search_client() as search_client:
            results = list(
                search_client.delete_documents(
                    documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
                )
            )
        failures = [result for result in results if not result.succeeded]
        logger.debug(
            f"results: {len(results) - len(failures)} successes, {len(failures)} failures"
        )
        if failures:
            raise WriteError(
                ", ".join(
                    f"[{failure.status_code}] {failure.error_message}" for failure in failures
                ),
            )

    @DestinationConnectionError.wrap
    @requires_dependencies(["azure"], extras="azure-ai-search")
    def write_dict(
        self, elements_dict: list[dict[str, Any]], search_client: "SearchClient"
    ) -> None:
        """Upload one batch of documents through an already-open *search_client*.

        Args:
            elements_dict: Documents to upload in a single call.
            search_client: Open client bound to the destination index.

        Raises:
            WriteError: If the request fails with an HTTP error, or the service
                reports a failure for any individual document.
        """
        import azure.core.exceptions

        logger.info(
            f"writing {len(elements_dict)} documents to destination "
            f"index at {self.connection_config.index}",
        )
        try:
            results = list(search_client.upload_documents(documents=elements_dict))
        except azure.core.exceptions.HttpResponseError as http_error:
            raise WriteError(f"http error: {http_error}") from http_error

        failures = [result for result in results if not result.succeeded]
        logger.debug(
            f"results: {len(results) - len(failures)} successes, {len(failures)} failures"
        )
        if failures:
            raise WriteError(
                ", ".join(
                    f"{failure.key}: [{failure.status_code}] {failure.error_message}"
                    for failure in failures
                ),
            )

    def can_delete(self) -> bool:
        """Report whether the index has a filterable ``upload_config.record_id_key`` field."""
        with self.connection_config.get_search_index_client() as search_index_client:
            index = search_index_client.get_index(name=self.connection_config.index)
            # First field with the configured name, or None when the index lacks it.
            record_id_field = next(
                (
                    index_field
                    for index_field in index.fields
                    if index_field.name == self.upload_config.record_id_key
                ),
                None,
            )
            if record_id_field is None:
                return False
            # Coerce so an unset (None) flag honors the declared ``bool`` return type.
            return bool(record_id_field.filterable)

    def get_index_key(self) -> str:
        """Return the name of the first field flagged as key in the index definition.

        Raises:
            ValueError: If no field of the index is flagged as key. This is the
                ``ValueError`` imported from ``unstructured_ingest.error``.
        """
        with self.connection_config.get_search_index_client() as search_index_client:
            index = search_index_client.get_index(name=self.connection_config.index)
            key_fields = [index_field for index_field in index.fields if index_field.key]
            if not key_fields:
                raise ValueError("no key field found in index fields")
            return key_fields[0].name

    def precheck(self) -> None:
        """Validate the connection by requesting the index's document count.

        Raises:
            DestinationConnectionError: If the request fails for any reason.
        """
        try:
            with self.connection_config.get_search_client() as search_client:
                search_client.get_document_count()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved for callers.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Replace the record's previously indexed content with *data*.

        Previous documents are deleted first when ``can_delete`` allows it; *data*
        is then uploaded in batches of ``upload_config.batch_size`` over one client.

        Args:
            data: Staged documents to upload.
            file_data: Descriptor of the record the documents belong to.
            **kwargs: Accepted for interface compatibility; unused here.
        """
        batch_size = self.upload_config.batch_size
        logger.info(
            f"writing document batches to destination"
            f" endpoint at {self.connection_config.endpoint}"
            f" index at {self.connection_config.index}"
            f" with batch size {batch_size}"
        )
        if self.can_delete():
            index_key = self.get_index_key()
            self.delete_by_record_id(file_data=file_data, index_key=index_key)
        else:
            logger.warning("criteria for deleting previous content not met, skipping")

        with self.connection_config.get_search_client() as search_client:
            for chunk in batch_generator(data, batch_size):
                self.write_dict(elements_dict=chunk, search_client=search_client)
267
+
268
+
269
# Registry entry bundling the Azure AI Search destination classes defined in this module.
azure_ai_search_destination_entry = DestinationRegistryEntry(
    upload_stager_config=AzureAISearchUploadStagerConfig,
    upload_stager=AzureAISearchUploadStager,
    uploader_config=AzureAISearchUploaderConfig,
    uploader=AzureAISearchUploader,
    connection_config=AzureAISearchConnectionConfig,
)
@@ -0,0 +1,193 @@
1
+ from dataclasses import dataclass, field
2
+ from datetime import date, datetime
3
+ from typing import TYPE_CHECKING, Annotated, Any, Optional
4
+
5
+ from dateutil import parser
6
+ from pydantic import Field, Secret
7
+ from pydantic.functional_validators import BeforeValidator
8
+
9
+ from unstructured_ingest.data_types.file_data import FileData
10
+ from unstructured_ingest.error import DestinationConnectionError, ValueError
11
+ from unstructured_ingest.interfaces import (
12
+ AccessConfig,
13
+ ConnectionConfig,
14
+ Uploader,
15
+ UploaderConfig,
16
+ UploadStager,
17
+ UploadStagerConfig,
18
+ )
19
+ from unstructured_ingest.logger import logger
20
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
21
+ from unstructured_ingest.utils.data_prep import (
22
+ batch_generator,
23
+ flatten_dict,
24
+ get_enhanced_element_id,
25
+ )
26
+ from unstructured_ingest.utils.dep_check import requires_dependencies
27
+
28
+ from .utils import conform_string_to_dict
29
+
30
+ if TYPE_CHECKING:
31
+ from chromadb import Client
32
+
33
# Connector type identifier for the Chroma destination; used as the default
# value of the ``connector_type`` attributes defined in this module.
CONNECTOR_TYPE: str = "chroma"
34
+
35
+
36
# Dict-valued option that is run through ``conform_string_to_dict`` before validation.
_ChromaDictOption = Annotated[dict, BeforeValidator(conform_string_to_dict)]


class ChromaAccessConfig(AccessConfig):
    # Access options for the Chroma client; both fields are optional and default to None.
    settings: Optional[_ChromaDictOption] = Field(
        default=None,
        description="A dictionary of settings to communicate with the chroma server.",
    )
    headers: Optional[_ChromaDictOption] = Field(
        default=None,
        description="A dictionary of headers to send to the Chroma server.",
    )
43
+
44
+
45
class ChromaConnectionConfig(ConnectionConfig):
    # Connection options for Chroma: either a local persisted ``path`` or an
    # HTTP ``host``/``port`` pair, together with tenant and database selection.
    access_config: Secret[ChromaAccessConfig] = Field(
        default=ChromaAccessConfig(),
        validate_default=True,
    )
    path: Optional[str] = Field(
        default=None,
        description="Location where Chroma is persisted, if not connecting via http.",
    )
    tenant: Optional[str] = Field(
        default="default_tenant",
        description="The tenant to use for this client.",
    )
    database: Optional[str] = Field(
        default="default_database",
        description="The database to use for this client.",
    )
    host: Optional[str] = Field(
        default=None,
        description="The hostname of the Chroma server.",
    )
    port: Optional[int] = Field(
        default=None,
        description="The port of the Chroma server.",
    )
    ssl: bool = Field(
        default=False,
        description="Whether to use SSL to connect to the Chroma server.",
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @requires_dependencies(["chromadb"], extras="chroma")
    def get_client(self) -> "Client":
        """Build a Chroma client from this configuration.

        A truthy ``path`` yields a ``chromadb.PersistentClient``.  Otherwise,
        when both ``host`` and ``port`` are truthy, a ``chromadb.HttpClient``
        is returned.

        Raises:
            ValueError: if neither ``path`` nor both ``host`` and ``port`` are set.
        """
        import chromadb

        secrets = self.access_config.get_secret_value()
        # Arguments that both client flavours receive.
        shared_kwargs = {
            "settings": secrets.settings,
            "tenant": self.tenant,
            "database": self.database,
        }

        # ``path`` takes precedence over ``host``/``port``.
        if self.path:
            return chromadb.PersistentClient(path=self.path, **shared_kwargs)

        if self.host and self.port:
            return chromadb.HttpClient(
                host=self.host,
                port=str(self.port),
                ssl=self.ssl,
                headers=secrets.headers,
                **shared_kwargs,
            )

        raise ValueError("Chroma connector requires either path or host and port to be set.")
90
+
91
+
92
class ChromaUploadStagerConfig(UploadStagerConfig):
    # Declares no fields of its own; everything comes from UploadStagerConfig.
    pass
94
+
95
+
96
@dataclass
class ChromaUploadStager(UploadStager):
    """Upload stager that reshapes element dicts into Chroma-style records."""

    upload_stager_config: ChromaUploadStagerConfig = field(
        default_factory=ChromaUploadStagerConfig
    )

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse ``date_string`` as a numeric timestamp or, failing that, a date string.

        The value is first interpreted as a float timestamp via
        ``datetime.fromtimestamp``.  If that raises, the failure is logged at
        debug level and the string is handed to ``dateutil.parser.parse``.
        """
        try:
            return datetime.fromtimestamp(float(date_string))
        except Exception as err:
            logger.debug(f"date {date_string} string not a timestamp: {err}")
        return parser.parse(date_string)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Convert one element dict into the record layout used for Chroma.

        Works on a shallow copy of ``element_dict``.  The id is computed from
        the full copy, ``embeddings`` and ``text`` are then removed from it,
        and whatever remains is flattened into the metadata mapping.
        """
        remaining = element_dict.copy()
        # The id must be derived before any keys are popped from the copy.
        record_id = get_enhanced_element_id(element_dict=remaining, file_data=file_data)
        embedding = remaining.pop("embeddings", None)
        document = remaining.pop("text", None)
        metadata = flatten_dict(remaining, separator="-", flatten_lists=True, remove_none=True)
        return {
            "id": record_id,
            "embedding": embedding,
            "document": document,
            "metadata": metadata,
        }
122
+
123
+
124
class ChromaUploaderConfig(UploaderConfig):
    # Upload options: the target collection (required) and the batch size (default 100).
    collection_name: str = Field(
        description="The name of the Chroma collection to write into.",
    )
    batch_size: int = Field(
        default=100,
        description="Number of records per batch",
    )
127
+
128
+
129
@dataclass
class ChromaUploader(Uploader):
    """Uploader that upserts staged element records into a Chroma collection."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: ChromaUploaderConfig
    connection_config: ChromaConnectionConfig

    def precheck(self) -> None:
        """Verify that a Chroma client can be built from the connection config.

        Raises:
            DestinationConnectionError: if ``get_client()`` fails; the
                underlying exception is attached as the cause.
        """
        try:
            self.connection_config.get_client()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly, matching ``upsert_batch``.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @DestinationConnectionError.wrap
    def upsert_batch(self, collection, batch):
        """Upsert one prepared batch into ``collection``.

        ``batch`` must provide the ``ids``, ``documents``, ``embeddings`` and
        ``metadatas`` keys, as produced by ``prepare_chroma_list``.

        Raises:
            DestinationConnectionError: if the upsert call raises.
        """
        try:
            # Chroma wants lists even if there is only one element
            # Upserting to prevent duplicates
            collection.upsert(
                ids=batch["ids"],
                documents=batch["documents"],
                embeddings=batch["embeddings"],
                metadatas=batch["metadatas"],
            )
        except Exception as e:
            raise DestinationConnectionError(f"chroma error: {e}") from e

    @staticmethod
    def prepare_chroma_list(chunk: tuple[dict[str, Any]]) -> dict[str, list[Any]]:
        """Break a sequence of record dicts into parallel lists for ChromaDb.

        ({'id':1}, {'id':2}, {'id':3}) -> {'ids':[1,2,3], ...}

        Keys missing from a record become ``None`` in the matching list, so
        the four lists always have one entry per record.
        """
        # Materialize once: a one-shot iterable would otherwise be exhausted by
        # the first comprehension and leave the remaining lists empty.
        records = list(chunk)
        return {
            "ids": [record.get("id") for record in records],
            "documents": [record.get("document") for record in records],
            "embeddings": [record.get("embedding") for record in records],
            "metadatas": [record.get("metadata") for record in records],
        }

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upsert ``data`` into the configured collection in batches.

        The collection named by ``upload_config.collection_name`` is fetched
        or created, then ``data`` is written in chunks of
        ``upload_config.batch_size``.
        """
        # ``get_client()`` prefers ``path`` over ``host``; log whichever is in
        # use instead of printing "None" for path-based clients.
        location = self.connection_config.path or self.connection_config.host
        logger.info(
            f"writing {len(data)} objects to destination "
            f"collection {self.upload_config.collection_name} "
            f"at {location}",
        )
        client = self.connection_config.get_client()

        collection = client.get_or_create_collection(name=self.upload_config.collection_name)
        for chunk in batch_generator(data, self.upload_config.batch_size):
            self.upsert_batch(collection, self.prepare_chroma_list(chunk))
185
+
186
+
187
# Registry entry bundling the Chroma connection config, upload stager and
# uploader (with their config classes) into one destination definition.
chroma_destination_entry = DestinationRegistryEntry(
    upload_stager_config=ChromaUploadStagerConfig,
    upload_stager=ChromaUploadStager,
    uploader_config=ChromaUploaderConfig,
    uploader=ChromaUploader,
    connection_config=ChromaConnectionConfig,
)