unstructured_ingest-1.2.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
unstructured_ingest/processes/connectors/astradb.py (new file)
@@ -0,0 +1,592 @@
+ import asyncio
+ import csv
+ import hashlib
+ import os
+ import re
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from time import time
+ from typing import TYPE_CHECKING, Any, Generator, Optional
+
+ from pydantic import BaseModel, Field, Secret
+
+ from unstructured_ingest.__version__ import __version__ as integration_version
+ from unstructured_ingest.data_types.file_data import (
+     BatchFileData,
+     BatchItem,
+     FileData,
+     FileDataSourceMetadata,
+     SourceIdentifiers,
+ )
+ from unstructured_ingest.error import (
+     DestinationConnectionError,
+     SourceConnectionError,
+     SourceConnectionNetworkError,
+ )
+ from unstructured_ingest.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     DownloadResponse,
+     Indexer,
+     IndexerConfig,
+     Uploader,
+     UploaderConfig,
+     UploadStager,
+     UploadStagerConfig,
+     download_responses,
+ )
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.processes.connector_registry import (
+     DestinationRegistryEntry,
+     SourceRegistryEntry,
+ )
+ from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
+ from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
+
+ if TYPE_CHECKING:
+     from astrapy import AsyncCollection as AstraDBAsyncCollection
+     from astrapy import Collection as AstraDBCollection
+     from astrapy import DataAPIClient as AstraDBClient
+     from astrapy import Database as AstraDB
+
+
+ CONNECTOR_TYPE = "astradb"
+
+ MAX_CONTENT_PARAM_BYTE_SIZE = 8000
+
+
+ class AstraDBAdditionalMetadata(BaseModel):
+     collection_name: str
+     keyspace: Optional[str] = None
+
+
+ class AstraDBBatchFileData(BatchFileData):
+     additional_metadata: AstraDBAdditionalMetadata
+
+
+ class AstraDBAccessConfig(AccessConfig):
+     token: str = Field(description="Astra DB Token with access to the database.")
+     api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
+
+
+ class AstraDBConnectionConfig(ConnectionConfig):
+     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+     access_config: Secret[AstraDBAccessConfig]
+
+     @requires_dependencies(["astrapy"], extras="astradb")
+     def get_client(self) -> "AstraDBClient":
+         from astrapy import DataAPIClient as AstraDBClient
+
+         # Create a client object to interact with the Astra DB
+         # caller_name/version for Astra DB tracking
+         user_agent = os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
+         return AstraDBClient(callers=[(user_agent, integration_version)])
+
+
+ def get_astra_db(
+     connection_config: AstraDBConnectionConfig,
+     keyspace: str,
+ ) -> "AstraDB":
+     # Build the Astra DB object.
+     access_configs = connection_config.access_config.get_secret_value()
+
+     # Create a client object to interact with the Astra DB
+     # caller_name/version for Astra DB tracking
+     client = connection_config.get_client()
+
+     # Get the database object
+     astra_db = client.get_database(
+         api_endpoint=access_configs.api_endpoint,
+         token=access_configs.token,
+         keyspace=keyspace,
+     )
+     return astra_db
+
+
+ def get_astra_collection(
+     connection_config: AstraDBConnectionConfig,
+     collection_name: str,
+     keyspace: str,
+ ) -> "AstraDBCollection":
+     astra_db = get_astra_db(connection_config=connection_config, keyspace=keyspace)
+
+     # astradb will return a collection object in all cases (even if it doesn't exist)
+     astra_db_collection = astra_db.get_collection(name=collection_name)
+
+     return astra_db_collection
+
+
+ async def get_async_astra_collection(
+     connection_config: AstraDBConnectionConfig,
+     collection_name: str,
+     keyspace: str,
+ ) -> "AstraDBAsyncCollection":
+     # Build the Astra DB object.
+     access_configs = connection_config.access_config.get_secret_value()
+
+     # Create a client object to interact with the Astra DB
+     client = connection_config.get_client()
+
+     # Get the async database object
+     async_astra_db = client.get_async_database(
+         api_endpoint=access_configs.api_endpoint,
+         token=access_configs.token,
+         keyspace=keyspace,
+     )
+
+     # Get async collection from AsyncDatabase
+     async_astra_db_collection = async_astra_db.get_collection(name=collection_name)
+     return async_astra_db_collection
+
+
+ class AstraDBIndexerConfig(IndexerConfig):
+     collection_name: str = Field(
+         description="The name of the Astra DB collection. "
+         "Note that the collection name must only include letters, "
+         "numbers, and underscores."
+     )
+     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+     batch_size: int = Field(default=20, description="Number of records per batch")
+
+
+ @dataclass
+ class AstraDBIndexer(Indexer):
+     connection_config: AstraDBConnectionConfig
+     index_config: AstraDBIndexerConfig
+
+     def get_collection(self) -> "AstraDBCollection":
+         return get_astra_collection(
+             connection_config=self.connection_config,
+             collection_name=self.index_config.collection_name,
+             keyspace=self.index_config.keyspace,
+         )
+
+     def precheck(self) -> None:
+         try:
+             self.get_collection().options()
+         except Exception as e:
+             logger.error(f"Failed to validate connection {e}", exc_info=True)
+             raise SourceConnectionError(f"failed to validate connection: {e}")
+
+     def _get_doc_ids(self) -> set[str]:
+         """Fetches all document ids in an index"""
+         # Get the collection
+         collection = self.get_collection()
+
+         # Perform the find operation to get all items
+         astra_db_docs_cursor = collection.find({}, projection={"_id": True})
+
+         # Iterate over the cursor
+         astra_db_docs = []
+         for result in astra_db_docs_cursor:
+             astra_db_docs.append(result)
+
+         # Create file data for each astra record
+         ids = sorted([astra_record["_id"] for astra_record in astra_db_docs])
+
+         return set(ids)
+
+     def run(self, **kwargs: Any) -> Generator[AstraDBBatchFileData, None, None]:
+         all_ids = self._get_doc_ids()
+         ids = list(all_ids)
+         id_batches = batch_generator(ids, self.index_config.batch_size)
+         for batch in id_batches:
+             batch_items = [BatchItem(identifier=b) for b in batch]
+             display_name = (
+                 f"{self.index_config.collection_name}-{self.index_config.keyspace}"
+                 f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]"
+             )
+             fd = AstraDBBatchFileData(
+                 connector_type=CONNECTOR_TYPE,
+                 metadata=FileDataSourceMetadata(
+                     date_processed=str(time()),
+                 ),
+                 additional_metadata=AstraDBAdditionalMetadata(
+                     collection_name=self.index_config.collection_name,
+                     keyspace=self.index_config.keyspace,
+                 ),
+                 batch_items=batch_items,
+                 display_name=display_name,
+             )
+             yield fd
+
+
+ class AstraDBDownloaderConfig(DownloaderConfig):
+     fields: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class AstraDBDownloader(Downloader):
+     connection_config: AstraDBConnectionConfig
+     download_config: AstraDBDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def is_async(self) -> bool:
+         return True
+
+     def get_identifier(self, record_id: str) -> str:
+         f = f"{record_id}"
+         if self.download_config.fields:
+             f = "{}-{}".format(
+                 f,
+                 hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+             )
+         return f
+
+     def write_astra_result_to_csv(self, astra_result: dict, download_path: str) -> None:
+         with open(download_path, "w", encoding="utf8") as f:
+             writer = csv.writer(f)
+             writer.writerow(astra_result.keys())
+             writer.writerow(astra_result.values())
+
+     def generate_download_response(
+         self, result: dict, file_data: AstraDBBatchFileData
+     ) -> DownloadResponse:
+         record_id = result["_id"]
+         filename_id = self.get_identifier(record_id=record_id)
+         filename = f"{filename_id}.csv"  # csv to preserve column info
+         download_path = self.download_dir / Path(filename)
+         logger.debug(f"Downloading results from record {record_id} as csv to {download_path}")
+         download_path.parent.mkdir(parents=True, exist_ok=True)
+         try:
+             self.write_astra_result_to_csv(astra_result=result, download_path=str(download_path))
+         except Exception as e:
+             logger.error(
+                 f"failed to download from record {record_id} to {download_path}: {e}",
+                 exc_info=True,
+             )
+             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+         # modify input file_data for download_response
+         file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+         cast_file_data = FileData.cast(file_data=file_data)
+         cast_file_data.identifier = filename
+         cast_file_data.metadata.date_processed = str(time())
+         cast_file_data.metadata.record_locator = {"document_id": record_id}
+         return super().generate_download_response(
+             file_data=cast_file_data, download_path=download_path
+         )
+
+     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+         raise NotImplementedError("Use astradb run_async instead")
+
+     async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+         # Get metadata from file_data
+         astra_file_data = AstraDBBatchFileData.cast(file_data=file_data)
+         ids: list[str] = [item.identifier for item in astra_file_data.batch_items]
+         collection_name: str = astra_file_data.additional_metadata.collection_name
+         keyspace: str = astra_file_data.additional_metadata.keyspace
+
+         # Retrieve results from async collection
+         download_responses = []
+         async_astra_collection = await get_async_astra_collection(
+             connection_config=self.connection_config,
+             collection_name=collection_name,
+             keyspace=keyspace,
+         )
+         async for result in async_astra_collection.find({"_id": {"$in": ids}}):
+             download_responses.append(
+                 self.generate_download_response(result=result, file_data=astra_file_data)
+             )
+         return download_responses
+
+
+ class AstraDBUploadStagerConfig(UploadStagerConfig):
+     flatten_metadata: Optional[bool] = Field(
+         default=False, description="Move metadata to top level of the record."
+     )
+     astra_generated_embeddings: bool = Field(
+         default=False,
+         description="Select this if you've configured an embedding provider integration "
+         "for your collection. Content will be inserted into the $vectorize field and "
+         "embeddings will be generated externally.",
+     )
+     enable_lexical_search: bool = Field(
+         default=False,
+         description="Select this to insert content into the $lexical field "
+         "for lexicographical or hybrid search.",
+     )
+
+
+ @dataclass
+ class AstraDBUploadStager(UploadStager):
+     upload_stager_config: AstraDBUploadStagerConfig = field(
+         default_factory=lambda: AstraDBUploadStagerConfig()
+     )
+
+     def truncate_dict_elements(self, element_dict: dict) -> None:
+         text = element_dict.pop("text", None)
+         if text is not None:
+             element_dict["text"] = truncate_string_bytes(text, MAX_CONTENT_PARAM_BYTE_SIZE)
+         metadata = element_dict.get("metadata")
+         if metadata is not None and isinstance(metadata, dict):
+             text_as_html = element_dict["metadata"].pop("text_as_html", None)
+             if text_as_html is not None:
+                 element_dict["metadata"]["text_as_html"] = truncate_string_bytes(
+                     text_as_html, MAX_CONTENT_PARAM_BYTE_SIZE
+                 )
+             metadata["original_elements"] = format_and_truncate_orig_elements(element_dict)
+             metadata.pop("orig_elements", None)
+
+     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+         self.truncate_dict_elements(element_dict)
+         if self.upload_stager_config.flatten_metadata:
+             # move metadata to top level so it isn't nested in metadata column
+             metadata = element_dict.pop("metadata", None)
+             if metadata:
+                 element_dict.update(metadata)
+
+         content = element_dict.pop("text", None)
+         embeddings = element_dict.pop("embeddings", None)
+
+         result = {
+             "content": content,
+             RECORD_ID_LABEL: file_data.identifier,
+             "metadata": element_dict,
+         }
+
+         # (Austin): We support bring-your-own embeddings XOR Astra-generated embeddings.
+         # Using neither /is/ a valid state, but for now we're enforcing Astra as a vector store.
+         has_unstructured_embeddings = embeddings is not None and len(embeddings) > 0
+         generate_embeddings = self.upload_stager_config.astra_generated_embeddings
+
+         if not has_unstructured_embeddings and not generate_embeddings:
+             raise ValueError(
+                 "No vectors provided. "
+                 "Please enable an Unstructured embedding provider or "
+                 "configure Astra to generate embeddings."
+             )
+         elif has_unstructured_embeddings and generate_embeddings:
+             raise ValueError(
+                 "Cannot use Unstructured embeddings and Astra-generated embeddings simultaneously. "
+                 "Please disable Astra generated embeddings or remove the Unstructured embedder."
+             )
+         elif generate_embeddings:
+             result["$vectorize"] = content
+         elif has_unstructured_embeddings:
+             result["$vector"] = embeddings
+
+         if self.upload_stager_config.enable_lexical_search:
+             result["$lexical"] = content
+
+         return result
+
+
+ class AstraDBUploaderConfig(UploaderConfig):
+     collection_name: Optional[str] = Field(
+         description="The name of the Astra DB collection. "
+         "Note that the collection name must only include letters, "
+         "numbers, and underscores.",
+         default=None,
+     )
+     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+     requested_indexing_policy: Optional[dict[str, Any]] = Field(
+         default=None,
+         description="The indexing policy to use for the collection.",
+         examples=['{"deny": ["metadata"]}'],
+     )
+     batch_size: int = Field(default=20, description="Number of records per batch")
+     max_concurrent_batches: int = Field(
+         default=10,
+         description="Maximum number of batches to upload concurrently. "
+         "Lower values reduce API load but may be slower. "
+         "Higher values may cause timeouts with very large uploads.",
+     )
+     record_id_key: str = Field(
+         default=RECORD_ID_LABEL,
+         description="searchable key to find entries for the same record on previous runs",
+     )
+     binary_encode_vectors: bool = Field(
+         default=True,
+         description="Upload vectors in a binary format. If set to False, "
+         "vectors will be a human-readable list of floats. "
+         "WARNING: Disabling this option may make the upload slower!",
+     )
+
+
+ @dataclass
+ class AstraDBUploader(Uploader):
+     connection_config: AstraDBConnectionConfig
+     upload_config: AstraDBUploaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def is_async(self) -> bool:
+         return True
+
+     def init(self, **kwargs: Any) -> None:
+         self.create_destination(**kwargs)
+
+     @requires_dependencies(["astrapy"], extras="astradb")
+     def precheck(self) -> None:
+         try:
+             if self.upload_config.collection_name:
+                 collection = get_astra_collection(
+                     connection_config=self.connection_config,
+                     collection_name=self.upload_config.collection_name,
+                     keyspace=self.upload_config.keyspace,
+                 )
+                 collection.options()
+             else:
+                 # check for db connection only if collection name is not provided
+                 get_astra_db(
+                     connection_config=self.connection_config,
+                     keyspace=self.upload_config.keyspace,
+                 )
+         except Exception as e:
+             logger.error(f"Failed to validate connection {e}", exc_info=True)
+             raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+     def _collection_exists(self, collection_name: str):
+         collection = get_astra_collection(
+             connection_config=self.connection_config,
+             collection_name=collection_name,
+             keyspace=self.upload_config.keyspace,
+         )
+
+         try:
+             collection.options()
+             return True
+         except RuntimeError as e:
+             if "not found" in str(e):
+                 return False
+             raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")
+         except Exception as e:
+             logger.error(f"failed to check if astra collection exists : {e}")
+             raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")
+
+     def format_destination_name(self, destination_name: str) -> str:
+         # AstraDB collection naming requirements:
+         # must be below 50 characters
+         # must be lowercase alphanumeric and underscores only
+         formatted = re.sub(r"[^a-z0-9]", "_", destination_name.lower())
+         return formatted
+
+     def create_destination(
+         self,
+         destination_name: str = "unstructuredautocreated",
+         vector_length: Optional[int] = None,
+         similarity_metric: Optional[str] = "cosine",
+         **kwargs: Any,
+     ) -> bool:
+         destination_name = self.format_destination_name(destination_name)
+         collection_name = self.upload_config.collection_name or destination_name
+         self.upload_config.collection_name = collection_name
+
+         if not self._collection_exists(collection_name):
+             from astrapy.info import CollectionDefinition
+
+             astra_db = get_astra_db(
+                 connection_config=self.connection_config, keyspace=self.upload_config.keyspace
+             )
+             logger.info(
+                 f"creating default astra collection '{collection_name}' with dimension "
+                 f"{vector_length} and metric {similarity_metric}"
+             )
+             definition = (
+                 CollectionDefinition.builder()
+                 .set_vector_dimension(dimension=vector_length)
+                 .set_vector_metric(similarity_metric)
+                 .build()
+             )
+             (astra_db.create_collection(collection_name, definition=definition),)
+             return True
+         logger.debug(f"collection with name '{collection_name}' already exists, skipping creation")
+         return False
+
+     async def delete_by_record_id(self, collection: "AstraDBAsyncCollection", file_data: FileData):
+         logger.debug(
+             f"deleting records from collection {collection.name} "
+             f"with {self.upload_config.record_id_key} "
+             f"set to {file_data.identifier}"
+         )
+         delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
+         delete_resp = await collection.delete_many(filter=delete_filter)
+         logger.debug(
+             f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
+         )
+
+     @requires_dependencies(["astrapy"], extras="astradb")
+     async def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+         logger.info(
+             f"writing {len(data)} objects to destination "
+             f"collection {self.upload_config.collection_name}"
+         )
+
+         astra_db_batch_size = self.upload_config.batch_size
+         max_concurrent = self.upload_config.max_concurrent_batches
+         async_astra_collection = await get_async_astra_collection(
+             connection_config=self.connection_config,
+             collection_name=self.upload_config.collection_name,
+             keyspace=self.upload_config.keyspace,
+         )
+
+         # If we're disabling binary encoded vectors, update the collection settings
+         if not self.upload_config.binary_encode_vectors:
+             from astrapy.api_options import APIOptions, SerdesOptions
+
+             async_astra_collection = async_astra_collection.with_options(
+                 api_options=APIOptions(serdes_options=SerdesOptions(binary_encode_vectors=False))
+             )
+
+         await self.delete_by_record_id(collection=async_astra_collection, file_data=file_data)
+
+         batches = list(batch_generator(data, astra_db_batch_size))
+         total_batches = len(batches)
+         logger.info(
+             f"Uploading {len(data)} elements in {total_batches} batches "
+             f"(batch_size={astra_db_batch_size}, max_concurrent={max_concurrent})"
+         )
+
+         semaphore = asyncio.Semaphore(max_concurrent)
+
+         log_interval = 100
+         async def upload_batch_with_semaphore(batch: tuple[dict, ...], batch_num: int) -> None:
+             async with semaphore:
+                 try:
+                     await async_astra_collection.insert_many(batch)
+                     if (batch_num + 1) % log_interval == 0 or batch_num == total_batches - 1:
+                         logger.debug(
+                             f"Upload progress: {batch_num + 1}/{total_batches} batches completed "
+                             f"({(batch_num + 1) / total_batches * 100:.1f}%)"
+                         )
+                 except Exception as e:
+                     logger.error(
+                         f"Failed to upload batch {batch_num + 1}/{total_batches}: {e}"
+                     )
+                     raise
+
+         await asyncio.gather(
+             *[
+                 upload_batch_with_semaphore(batch, batch_num)
+                 for batch_num, batch in enumerate(batches)
+             ]
+         )
+
+     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         data = get_json_data(path=path)
+         await self.run_data(data=data, file_data=file_data)
+
+     def run(self, **kwargs: Any) -> Any:
+         raise NotImplementedError("Use astradb run_async instead")
+
+
+ astra_db_source_entry = SourceRegistryEntry(
+     indexer=AstraDBIndexer,
+     indexer_config=AstraDBIndexerConfig,
+     downloader=AstraDBDownloader,
+     downloader_config=AstraDBDownloaderConfig,
+     connection_config=AstraDBConnectionConfig,
+ )
+
+ astra_db_destination_entry = DestinationRegistryEntry(
+     connection_config=AstraDBConnectionConfig,
+     upload_stager_config=AstraDBUploadStagerConfig,
+     upload_stager=AstraDBUploadStager,
+     uploader_config=AstraDBUploaderConfig,
+     uploader=AstraDBUploader,
+ )
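For orientation, below is a minimal, hypothetical sketch of how the classes in this new module might be wired up by hand. The class names come from the diff above; the environment variable names, collection, and keyspace values are placeholders, and it assumes the astradb extra (astrapy) is installed so that the @requires_dependencies-decorated methods can run.

    import os

    from unstructured_ingest.processes.connectors.astradb import (
        AstraDBAccessConfig,
        AstraDBConnectionConfig,
        AstraDBUploader,
        AstraDBUploaderConfig,
    )

    # Placeholder credentials; in practice these come from the Astra DB console.
    access_config = AstraDBAccessConfig(
        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
        api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
    )
    connection_config = AstraDBConnectionConfig(access_config=access_config)

    # Uploader pointed at an existing collection; precheck() validates the
    # connection the same way the pipeline does before any records are written.
    uploader = AstraDBUploader(
        connection_config=connection_config,
        upload_config=AstraDBUploaderConfig(
            collection_name="my_collection",
            keyspace="default_keyspace",
        ),
    )
    uploader.precheck()

In normal use these objects are not constructed by hand: astra_db_source_entry and astra_db_destination_entry register the indexer, downloader, stager, and uploader with the connector registry, and the CLI and pipeline layers instantiate them from user-supplied options.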