unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,523 @@
1
+ import asyncio
2
+ import re
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from time import time
7
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Optional, Tuple
8
+
9
+ from pydantic import BaseModel, Field, Secret, field_validator
10
+
11
+ from unstructured_ingest.data_types.file_data import (
12
+ BatchFileData,
13
+ BatchItem,
14
+ FileData,
15
+ FileDataSourceMetadata,
16
+ )
17
+ from unstructured_ingest.error import (
18
+ DestinationConnectionError,
19
+ SourceConnectionError,
20
+ )
21
+ from unstructured_ingest.interfaces import (
22
+ AccessConfig,
23
+ ConnectionConfig,
24
+ )
25
+ from unstructured_ingest.interfaces.downloader import download_responses
26
+ from unstructured_ingest.logger import logger
27
+ from unstructured_ingest.processes.connector_registry import (
28
+ DestinationRegistryEntry,
29
+ SourceRegistryEntry,
30
+ )
31
+ from unstructured_ingest.processes.connectors.elasticsearch.elasticsearch import (
32
+ ElasticsearchBatchFileData,
33
+ ElasticsearchDownloader,
34
+ ElasticsearchDownloaderConfig,
35
+ ElasticsearchIndexer,
36
+ ElasticsearchIndexerConfig,
37
+ ElasticsearchUploader,
38
+ ElasticsearchUploaderConfig,
39
+ ElasticsearchUploadStager,
40
+ ElasticsearchUploadStagerConfig,
41
+ ElastisearchAdditionalMetadata,
42
+ )
43
+ from unstructured_ingest.utils.data_prep import batch_generator, generator_batching_wbytes
44
+ from unstructured_ingest.utils.dep_check import requires_dependencies
45
+
46
+ if TYPE_CHECKING:
47
+ from opensearchpy import OpenSearch
48
+
49
CONNECTOR_TYPE = "opensearch"

# OpenSearch connector - inherits from Elasticsearch connector (OpenSearch is an ES fork).

# Precompiled regex patterns for AWS hostname detection (GovCloud, China, standard).
# Group 1 captures the region, e.g. "us-east-1", "us-gov-west-1" or "cn-north-1".
# AWS China endpoints live under "amazonaws.com.cn", so an optional ".cn" suffix is
# accepted; without it the "$" anchor would reject every China hostname.
_ES_PATTERN = re.compile(r"\.([a-z]{2}(?:-[a-z]+)+-\d+)\.es\.amazonaws\.com(?:\.cn)?$")
_AOSS_PATTERN = re.compile(
    r"^[a-z0-9]+\.([a-z]{2}(?:-[a-z]+)+-\d+)\.aoss\.amazonaws\.com(?:\.cn)?$"
)
56
+
57
+
58
def _run_coroutine(fn: Callable[..., Awaitable[Any]], *args: Any, **kwargs: Any) -> Any:
    """Run an async function from sync context, handling existing event loops.

    Args:
        fn: Async callable to execute.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        Whatever the coroutine returned.

    Raises:
        Any exception raised by the coroutine is propagated to the caller.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_running = False
    else:
        loop_running = True

    if not loop_running:
        # Called outside the ``except`` block on purpose: otherwise every error raised
        # by the coroutine would carry the "no running event loop" RuntimeError as its
        # implicit ``__context__`` and clutter the traceback.
        return asyncio.run(fn(*args, **kwargs))

    # A loop is already running in this thread, and asyncio.run() refuses to nest.
    # Drive the coroutine on a fresh loop in a worker thread and block for the result.
    with ThreadPoolExecutor(thread_name_prefix="opensearch") as pool:
        return pool.submit(lambda: asyncio.run(fn(*args, **kwargs))).result()
67
+
68
+
69
class OpenSearchAccessConfig(AccessConfig):
    # Secret part of the OpenSearch connection settings. Every field is optional and
    # defaults to None.

    # --- basic auth ---
    password: Optional[str] = Field(
        default=None,
        description="password when using basic auth",
    )

    # --- AWS IAM auth ---
    aws_access_key_id: Optional[str] = Field(
        default=None,
        description=(
            "AWS access key ID. When provided (with secret), IAM authentication is used. "
            "Region and service type are auto-detected from the host URL."
        ),
    )
    aws_secret_access_key: Optional[str] = Field(
        default=None,
        description="AWS secret access key. Required when aws_access_key_id is provided.",
    )
    aws_session_token: Optional[str] = Field(
        default=None,
        description="AWS session token for temporary credentials (optional)",
    )
83
+
84
+
85
def detect_aws_opensearch_config(host: str) -> Optional[Tuple[str, str]]:
    """Auto-detect AWS region and service from an OpenSearch hostname or URL.

    Args:
        host: Bare hostname or URL. A scheme, port, path, query string and fragment
            are tolerated and ignored, and matching is case-insensitive.

    Returns:
        ``(region, service)`` where ``service`` is ``"es"`` or ``"aoss"``, or ``None``
        when the host matches neither AWS hostname pattern.
    """
    clean_host = host.strip()

    # Drop the scheme whatever its case ("HTTPS://..." used to slip through).
    if "://" in clean_host:
        clean_host = clean_host.split("://", 1)[1]

    # Cut everything after the authority (path, query, fragment), then the port.
    # A trailing "/" previously defeated the "$"-anchored patterns.
    for separator in ("/", "?", "#", ":"):
        clean_host = clean_host.split(separator, 1)[0]

    # Hostnames are case-insensitive while the patterns are written in lower case.
    clean_host = clean_host.lower()

    for pattern, service in ((_ES_PATTERN, "es"), (_AOSS_PATTERN, "aoss")):
        match = pattern.search(clean_host)
        if match:
            return (match.group(1), service)

    return None
99
+
100
+
101
class OpenSearchClientInput(BaseModel):
    # Pydantic model used to validate the keyword arguments that are later dumped
    # and passed to the OpenSearch client constructor.

    # (username, password) pair for basic auth; wrapped in Secret so it is masked
    # when the model is printed or dumped.
    http_auth: Secret[Optional[tuple[str, str]]] = None
    # Host URLs to connect to.
    hosts: Optional[list[str]] = None
    # TLS switches; all default to False.
    use_ssl: bool = False
    verify_certs: bool = False
    ssl_show_warn: bool = False
    # Certificate material, given as filesystem paths rendered to strings.
    ca_certs: Optional[str] = None
    client_cert: Optional[str] = None
    client_key: Optional[str] = None
110
+
111
+
112
class OpenSearchConnectionConfig(ConnectionConfig):
    # Connection settings for an OpenSearch cluster. The authentication mode is chosen
    # in get_async_client_kwargs(): AWS IAM keys first, then username/password, then
    # client certificate, otherwise an unauthenticated connection.

    hosts: list[str] = Field(
        ...,
        min_length=1,
        description="List of the OpenSearch hosts to connect",
        examples=["http://localhost:9200"],
    )
    username: Optional[str] = Field(default=None, description="username when using basic auth")
    use_ssl: bool = Field(default=False, description="use ssl for the connection")
    verify_certs: bool = Field(default=False, description="whether to verify SSL certificates")
    ssl_show_warn: bool = Field(
        default=False,
        description="show warning when verify certs is disabled",
    )
    ca_certs: Optional[Path] = Field(default=None, description="path to CA bundle")
    client_cert: Optional[Path] = Field(
        default=None,
        description=(
            "path to the file containing the private key and the certificate,"
            " or cert only if using client_key"
        ),
    )
    client_key: Optional[Path] = Field(
        default=None,
        description=(
            "path to the file containing the private key"
            " if using separate cert and key files"
        ),
    )

    access_config: Secret[OpenSearchAccessConfig]

    @field_validator("hosts", mode="before")
    @classmethod
    def validate_hosts(cls, value):
        """Normalise ``hosts`` to a list and reject empty input or blank entries.

        A single string is wrapped into a one-element list before validation.

        Raises:
            ValueError: if no host is given or any host is empty/whitespace-only.
        """
        hosts = [value] if isinstance(value, str) else value
        if not hosts:
            raise ValueError("At least one OpenSearch host must be provided. ")
        if any(not host or not host.strip() for host in hosts):
            raise ValueError("Host URL cannot be empty")
        return hosts

    def _has_aws_credentials(self) -> bool:
        """Tell whether a complete AWS key pair is configured.

        Returns:
            True when both the access key ID and the secret access key are set,
            False when neither is set.

        Raises:
            ValueError: if exactly one of the two keys is set.
        """
        secrets = self.access_config.get_secret_value()
        key_id_given = secrets.aws_access_key_id is not None
        secret_given = secrets.aws_secret_access_key is not None

        if key_id_given and secret_given:
            return True
        if not key_id_given and not secret_given:
            return False

        # Exactly one half of the key pair was supplied - refuse to guess.
        raise ValueError(
            "AWS IAM authentication requires BOTH aws_access_key_id and aws_secret_access_key. "
            f"Currently provided: aws_access_key_id={'set' if key_id_given else 'not set'}, "
            f"aws_secret_access_key={'set' if secret_given else 'not set'}"
        )

    def _detect_and_validate_aws_config(self) -> Tuple[str, str]:
        """Derive ``(region, service)`` from the first configured host.

        Raises:
            ValueError: if no host is configured or the first host does not look like
                an AWS OpenSearch endpoint.
        """
        if not self.hosts:
            raise ValueError("Host is required for AWS OpenSearch connection")

        first_host = self.hosts[0]
        detected = detect_aws_opensearch_config(first_host)
        if detected:
            region, service = detected
            logger.debug(
                f"Auto-detected AWS configuration from host: region={region}, service={service}"
            )
            return region, service

        raise ValueError(
            f"Could not auto-detect AWS region and service from host: {first_host}. "
            "Ensure your host URL follows AWS OpenSearch format: "
            "https://search-domain-xxx.REGION.es.amazonaws.com (for OpenSearch Service) or "
            "https://xxx.REGION.aoss.amazonaws.com (for OpenSearch Serverless)"
        )

    @requires_dependencies(["opensearchpy", "boto3"], extras="opensearch")
    async def _get_async_aws_auth(self):
        """Build the ``AWSV4SignerAsyncAuth`` handler used by async clients.

        Raises:
            ValueError: if boto3 yields no credentials, or the region/service cannot
                be detected from the host.
        """
        import boto3
        from opensearchpy import AWSV4SignerAsyncAuth

        secrets = self.access_config.get_secret_value()
        credentials = boto3.Session(
            aws_access_key_id=secrets.aws_access_key_id,
            aws_secret_access_key=secrets.aws_secret_access_key,
            aws_session_token=secrets.aws_session_token,
        ).get_credentials()

        if not credentials:
            raise ValueError("Failed to obtain AWS credentials from provided keys")

        region, service = self._detect_and_validate_aws_config()
        return AWSV4SignerAsyncAuth(credentials, region, service)

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    async def get_async_client_kwargs(self) -> dict:
        """Assemble the keyword arguments for ``AsyncOpenSearch``.

        Authentication is picked in this order: AWS IAM, basic auth, client
        certificate, none. Retry and timeout settings are always added, and keys whose
        value is ``None`` are dropped from the result.
        """
        secrets = self.access_config.get_secret_value()

        # Only truthy settings are forwarded; anything else falls back to the
        # OpenSearchClientInput defaults.
        candidates = {
            "hosts": self.hosts,
            "use_ssl": self.use_ssl,
            "verify_certs": self.verify_certs,
            "ssl_show_warn": self.ssl_show_warn,
            "ca_certs": str(self.ca_certs) if self.ca_certs else None,
            "client_cert": str(self.client_cert) if self.client_cert else None,
            "client_key": str(self.client_key) if self.client_key else None,
        }
        client_input_kwargs = {name: setting for name, setting in candidates.items() if setting}

        if self._has_aws_credentials():
            logger.debug("Using AWS IAM authentication")

            # Must use http_async.AsyncHttpConnection for IAM auth handlers
            from opensearchpy.connection.http_async import AsyncHttpConnection

            client_kwargs = OpenSearchClientInput(**client_input_kwargs).model_dump()
            client_kwargs["http_auth"] = await self._get_async_aws_auth()
            client_kwargs["connection_class"] = AsyncHttpConnection
        elif self.username and secrets.password:
            logger.debug("Using basic HTTP authentication")
            client_input_kwargs["http_auth"] = (self.username, secrets.password)

            client_input = OpenSearchClientInput(**client_input_kwargs)
            client_kwargs = client_input.model_dump()
            if client_input.http_auth:
                # Replace the masked Secret wrapper with the real (user, password) pair.
                client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()
        else:
            if self.client_cert:
                logger.debug("Using certificate-based authentication")
            else:
                logger.warning("No authentication configured - connecting without credentials")
            client_kwargs = OpenSearchClientInput(**client_input_kwargs).model_dump()

        # Retry and timeout configuration for resilience against transient errors
        client_kwargs.update(
            max_retries=3,
            retry_on_status=[429, 502, 503],
            retry_on_timeout=True,
            timeout=60,
        )

        return {name: setting for name, setting in client_kwargs.items() if setting is not None}
266
+
267
+
268
# Indexer settings for OpenSearch; adds nothing to the Elasticsearch indexer config
# it inherits from.
class OpenSearchIndexerConfig(ElasticsearchIndexerConfig):
    pass
270
+
271
+
272
@dataclass
class OpenSearchIndexer(ElasticsearchIndexer):
    """Source indexer that lists the document IDs of an OpenSearch index in batches."""

    connection_config: OpenSearchConnectionConfig
    index_config: OpenSearchIndexerConfig
    client: "OpenSearch" = field(init=False)

    def is_async(self) -> bool:
        """Signal pipeline to use async execution."""
        return True

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def precheck(self) -> None:
        """Validate connection and index (sync wrapper required by pipeline framework).

        Raises:
            SourceConnectionError: if the client cannot be built or the index lookup
                fails; the original exception is attached as ``__cause__``.
        """

        async def _async_precheck():
            from opensearchpy import AsyncOpenSearch

            try:
                async with AsyncOpenSearch(
                    **await self.connection_config.get_async_client_kwargs()
                ) as client:
                    # Use get_alias (GET) instead of exists (HEAD) - HEAD has IAM signing issues
                    # Also respects AWS FGAC by checking only the specific index
                    await client.indices.get_alias(index=self.index_config.index_name)
            except Exception as e:
                logger.error(f"failed to validate connection: {e}", exc_info=True)
                # Chain explicitly so the root cause stays visible as the direct cause.
                raise SourceConnectionError(f"failed to validate connection: {e}") from e

        _run_coroutine(_async_precheck)

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    async def run_async(self, **kwargs: Any) -> AsyncGenerator[ElasticsearchBatchFileData, None]:
        """Yield one batch descriptor per ``batch_size`` document IDs in the index.

        All IDs are fetched up front, then split into batches. Each yielded item
        carries the index URL, the index name and the IDs of that batch.
        """
        ids = list(await self._get_doc_ids_async())
        # The URL is the same for every batch, so build it once.
        url = f"{self.connection_config.hosts[0]}/{self.index_config.index_name}"
        for batch in batch_generator(ids, self.index_config.batch_size):
            batch_items = [BatchItem(identifier=b) for b in batch]
            display_name = (
                f"url={url}, batch_size={len(batch_items)} "
                f"ids={batch_items[0].identifier}..{batch_items[-1].identifier}"
            )
            yield ElasticsearchBatchFileData(
                connector_type=CONNECTOR_TYPE,
                metadata=FileDataSourceMetadata(
                    url=url,
                    date_processed=str(time()),
                ),
                additional_metadata=ElastisearchAdditionalMetadata(
                    index_name=self.index_config.index_name,
                ),
                batch_items=batch_items,
                display_name=display_name,
            )

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    async def _get_doc_ids_async(self) -> set[str]:
        """Fetch the ``_id`` of every document in the index using ``async_scan``."""
        from opensearchpy import AsyncOpenSearch
        from opensearchpy.helpers import async_scan

        # Empty stored_fields: only the hit metadata (including _id) is needed here.
        scan_query = {"stored_fields": [], "query": {"match_all": {}}}

        async with AsyncOpenSearch(
            **await self.connection_config.get_async_client_kwargs()
        ) as client:
            doc_ids = set()
            async for hit in async_scan(
                client,
                query=scan_query,
                scroll="1m",
                index=self.index_config.index_name,
            ):
                doc_ids.add(hit["_id"])
            return doc_ids
346
+
347
+
348
# Downloader settings for OpenSearch; adds nothing to the Elasticsearch downloader
# config it inherits from.
class OpenSearchDownloaderConfig(ElasticsearchDownloaderConfig):
    pass
350
+
351
+
352
@dataclass
class OpenSearchDownloader(ElasticsearchDownloader):
    connection_config: OpenSearchConnectionConfig
    download_config: OpenSearchDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    async def run_async(self, file_data: BatchFileData, **kwargs: Any) -> download_responses:
        """Fetch the documents named in ``file_data`` and build their download responses.

        The batch is cast to ``ElasticsearchBatchFileData``, its item identifiers are
        queried with ``async_scan``, and every hit is turned into a response through
        ``generate_download_response``.
        """
        from opensearchpy import AsyncOpenSearch
        from opensearchpy.helpers import async_scan

        batch = ElasticsearchBatchFileData.cast(file_data=file_data)
        index_name: str = batch.additional_metadata.index_name
        ids: list[str] = [entry.identifier for entry in batch.batch_items]

        scan_query = {"version": True, "query": {"ids": {"values": ids}}}

        # Only add _source if fields are explicitly specified (avoids AWS FGAC issues)
        if self.download_config.fields:
            scan_query["_source"] = self.download_config.fields

        client_kwargs = await self.connection_config.get_async_client_kwargs()
        async with AsyncOpenSearch(**client_kwargs) as client:
            hits = async_scan(client, query=scan_query, scroll="1m", index=index_name)
            responses = [
                self.generate_download_response(
                    result=hit, index_name=index_name, file_data=batch
                )
                async for hit in hits
            ]
        return responses
394
+
395
+
396
class OpenSearchUploaderConfig(ElasticsearchUploaderConfig):
    # Byte budget for a single upload batch, declared here with a 5 MB default.
    batch_size_bytes: int = Field(
        description=(
            "Size limit (in bytes) for each batch of items to be uploaded. "
            "Default is 5MB, lower than Elasticsearch default to accommodate "
            "AWS OpenSearch cluster rate limits."
        ),
        default=5_000_000,
    )
403
+
404
+
405
@dataclass
class OpenSearchUploader(ElasticsearchUploader):
    """Uploader that writes staged elements to an OpenSearch index with ``async_bulk``."""

    connection_config: OpenSearchConnectionConfig
    upload_config: OpenSearchUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Signal pipeline to use async execution."""
        return True

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def precheck(self) -> None:
        """Validate connection and index (sync wrapper required by pipeline framework).

        Raises:
            DestinationConnectionError: If the client cannot be created or the
                ``get_alias`` call for the configured index fails. The original
                exception is attached as ``__cause__``.
        """

        async def _async_precheck():
            from opensearchpy import AsyncOpenSearch

            try:
                async with AsyncOpenSearch(
                    **await self.connection_config.get_async_client_kwargs()
                ) as client:
                    # Use get_alias (GET) instead of exists (HEAD) - HEAD has IAM signing issues
                    # Also respects AWS FGAC by checking only the specific index
                    await client.indices.get_alias(index=self.upload_config.index_name)
            except Exception as e:
                logger.error(f"failed to validate connection: {e}", exc_info=True)
                # Chain explicitly so the root cause stays attached to the wrapped error.
                raise DestinationConnectionError(f"failed to validate connection: {e}") from e

        _run_coroutine(_async_precheck)

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload data to OpenSearch using async_bulk.

        ``data`` is split into batches bounded by ``upload_config.batch_size_bytes``.
        Each batch is sent with up to three attempts; a retry (after a 5 second
        sleep) happens only when the failure looks like rate limiting (HTTP 429).

        Args:
            data: Elements to index.
            file_data: Accepted for interface compatibility; not read here.
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            DestinationConnectionError: If a bulk call fails with a non-rate-limit
                error, if rate-limit retries are exhausted, or if the bulk call
                reports any failed documents.
        """
        from opensearchpy import AsyncOpenSearch
        from opensearchpy.exceptions import TransportError
        from opensearchpy.helpers import async_bulk

        logger.debug(
            f"writing {len(data)} elements to index {self.upload_config.index_name} "
            f"at {self.connection_config.hosts} "
            f"with batch size (bytes) {self.upload_config.batch_size_bytes}"
        )

        # Retry budget for rate limiting (429 errors); identical for every batch.
        max_attempts = 3

        async with AsyncOpenSearch(
            **await self.connection_config.get_async_client_kwargs()
        ) as client:
            for batch in generator_batching_wbytes(
                data, batch_size_limit_bytes=self.upload_config.batch_size_bytes
            ):
                for attempt in range(max_attempts):
                    try:
                        success, failed = await async_bulk(
                            client=client,
                            actions=batch,
                            chunk_size=len(batch),
                            max_chunk_bytes=self.upload_config.batch_size_bytes,
                            raise_on_error=False,
                        )
                        break
                    except Exception as e:
                        # Check for rate limiting: precise type check, then string fallback
                        error_text = str(e)
                        is_rate_limited = (
                            (isinstance(e, TransportError) and e.status_code == 429)
                            or "429" in error_text
                            or "too many requests" in error_text.lower()
                        )

                        if attempt < max_attempts - 1 and is_rate_limited:
                            logger.warning(
                                f"Rate limited (attempt {attempt + 1}/{max_attempts}), "
                                f"waiting 5s before retry: {e}"
                            )
                            await asyncio.sleep(5)
                        else:
                            logger.error(f"Batch upload failed: {e}")
                            # Keep the underlying error as the explicit cause.
                            raise DestinationConnectionError(error_text) from e

                # Check for document failures (outside try to avoid catching our own exception)
                if failed:
                    logger.error(
                        f"Batch upload had {len(failed)} failures out of {len(batch)}. "
                        f"Failed items: {failed[:5]}"
                    )
                    raise DestinationConnectionError(
                        f"Failed to upload {len(failed)} out of {len(batch)} documents"
                    )

                logger.debug(
                    f"uploaded batch of {len(batch)} elements to {self.upload_config.index_name}"
                )

        logger.info(f"Upload complete: {len(data)} elements to {self.upload_config.index_name}")
497
+
498
+
499
class OpenSearchUploadStagerConfig(ElasticsearchUploadStagerConfig):
    # Declares no fields or overrides of its own; it gives the OpenSearch
    # connector a separately named upload stager config type.
    ...
501
+
502
+
503
@dataclass
class OpenSearchUploadStager(ElasticsearchUploadStager):
    """Upload stager for OpenSearch that narrows the config type to the OpenSearch variant."""

    upload_stager_config: OpenSearchUploadStagerConfig
506
+
507
+
508
# Source-side registry entry: bundles the OpenSearch connection config with the
# indexer and downloader classes and their config types.
opensearch_source_entry = SourceRegistryEntry(
    indexer=OpenSearchIndexer,
    indexer_config=OpenSearchIndexerConfig,
    downloader=OpenSearchDownloader,
    downloader_config=OpenSearchDownloaderConfig,
    connection_config=OpenSearchConnectionConfig,
)
515
+
516
+
517
# Destination-side registry entry: bundles the OpenSearch connection config with
# the upload stager and uploader classes and their config types.
opensearch_destination_entry = DestinationRegistryEntry(
    upload_stager=OpenSearchUploadStager,
    upload_stager_config=OpenSearchUploadStagerConfig,
    uploader=OpenSearchUploader,
    uploader_config=OpenSearchUploaderConfig,
    connection_config=OpenSearchConnectionConfig,
)
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.processes.connector_registry import (
4
+ add_destination_entry,
5
+ add_source_entry,
6
+ )
7
+
8
+ from .azure import CONNECTOR_TYPE as AZURE_CONNECTOR_TYPE
9
+ from .azure import azure_destination_entry, azure_source_entry
10
+ from .box import CONNECTOR_TYPE as BOX_CONNECTOR_TYPE
11
+ from .box import box_destination_entry, box_source_entry
12
+ from .dropbox import CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE
13
+ from .dropbox import dropbox_destination_entry, dropbox_source_entry
14
+ from .gcs import CONNECTOR_TYPE as GCS_CONNECTOR_TYPE
15
+ from .gcs import gcs_destination_entry, gcs_source_entry
16
+ from .s3 import CONNECTOR_TYPE as S3_CONNECTOR_TYPE
17
+ from .s3 import s3_destination_entry, s3_source_entry
18
+ from .sftp import CONNECTOR_TYPE as SFTP_CONNECTOR_TYPE
19
+ from .sftp import sftp_destination_entry, sftp_source_entry
20
+
21
# Register every connector imported above as both a source and a destination.
# Rows are processed top to bottom, source entry first, then destination entry.
for _connector_type, _source_entry, _destination_entry in (
    (AZURE_CONNECTOR_TYPE, azure_source_entry, azure_destination_entry),
    (BOX_CONNECTOR_TYPE, box_source_entry, box_destination_entry),
    (DROPBOX_CONNECTOR_TYPE, dropbox_source_entry, dropbox_destination_entry),
    (GCS_CONNECTOR_TYPE, gcs_source_entry, gcs_destination_entry),
    (S3_CONNECTOR_TYPE, s3_source_entry, s3_destination_entry),
    (SFTP_CONNECTOR_TYPE, sftp_source_entry, sftp_destination_entry),
):
    add_source_entry(source_type=_connector_type, entry=_source_entry)
    add_destination_entry(destination_type=_connector_type, entry=_destination_entry)

# Drop the loop temporaries so the module namespace is unchanged.
del _connector_type, _source_entry, _destination_entry