unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,478 @@
1
+ import collections
2
+ import hashlib
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from time import time
7
+ from typing import TYPE_CHECKING, Any, Generator, Optional, Union
8
+
9
+ from pydantic import BaseModel, Field, Secret, SecretStr, field_validator
10
+
11
+ from unstructured_ingest.data_types.file_data import (
12
+ BatchFileData,
13
+ BatchItem,
14
+ FileData,
15
+ FileDataSourceMetadata,
16
+ SourceIdentifiers,
17
+ )
18
+ from unstructured_ingest.error import (
19
+ DestinationConnectionError,
20
+ SourceConnectionError,
21
+ SourceConnectionNetworkError,
22
+ UnstructuredIngestError,
23
+ )
24
+ from unstructured_ingest.interfaces import (
25
+ AccessConfig,
26
+ ConnectionConfig,
27
+ Downloader,
28
+ DownloaderConfig,
29
+ DownloadResponse,
30
+ Indexer,
31
+ IndexerConfig,
32
+ Uploader,
33
+ UploaderConfig,
34
+ UploadStager,
35
+ UploadStagerConfig,
36
+ download_responses,
37
+ )
38
+ from unstructured_ingest.logger import logger
39
+ from unstructured_ingest.processes.connector_registry import (
40
+ DestinationRegistryEntry,
41
+ SourceRegistryEntry,
42
+ )
43
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
44
+ from unstructured_ingest.utils.data_prep import (
45
+ batch_generator,
46
+ flatten_dict,
47
+ generator_batching_wbytes,
48
+ get_enhanced_element_id,
49
+ )
50
+ from unstructured_ingest.utils.dep_check import requires_dependencies
51
+
52
+ if TYPE_CHECKING:
53
+ from elasticsearch import Elasticsearch as ElasticsearchClient
54
+
55
# Connector identifier shared by the indexer, downloader and uploader defined
# in this module; it is also stamped on every batch the indexer emits.
CONNECTOR_TYPE = "elasticsearch"
56
+
57
+
58
# Extra, connector-specific metadata carried alongside a batch of documents.
# (The class name's spelling is part of the public interface and is kept as is.)
class ElastisearchAdditionalMetadata(BaseModel):
    # Name of the index the batch's document ids were read from.
    index_name: str
60
+
61
+
62
# Batch file-data record specialised for Elasticsearch: same payload as
# BatchFileData, but `additional_metadata` is narrowed to the model that
# carries the source index name.
class ElasticsearchBatchFileData(BatchFileData):
    additional_metadata: ElastisearchAdditionalMetadata
64
+
65
+
66
# Sensitive credentials for an Elasticsearch connection. Every field is
# optional; which one is actually used is decided by
# ElasticsearchConnectionConfig.get_client_kwargs.
#
# NOTE(review): in the visible part of this module `bearer_auth` is never read,
# and `ssl_assert_fingerprint` is only consulted as a condition when selecting
# basic auth -- neither value is forwarded to the client. Confirm that this is
# intended.
class ElasticsearchAccessConfig(AccessConfig):
    password: Optional[str] = Field(
        description="password when using basic auth or connecting to a cloud instance",
        default=None,
    )
    es_api_key: Optional[str] = Field(
        description="api key used for authentication",
        default=None,
    )
    bearer_auth: Optional[str] = Field(
        description="bearer token used for HTTP bearer authentication",
        default=None,
    )
    ssl_assert_fingerprint: Optional[str] = Field(
        description="SHA256 fingerprint value",
        default=None,
    )
77
+
78
+
79
# Validation model for the keyword arguments handed to the Elasticsearch
# client. Credentials are wrapped in pydantic secret types so that dumping the
# model (e.g. for debug logging) does not expose them in clear text.
class ElasticsearchClientInput(BaseModel):
    hosts: Optional[list[str]] = None
    cloud_id: Optional[str] = None
    ca_certs: Optional[Path] = None
    # (username, password) pair.
    basic_auth: Optional[Secret[tuple[str, str]]] = None
    # Either an (id, key) pair or a single key string.
    api_key: Union[Secret[tuple[str, str]], SecretStr, None] = None
85
+
86
+
87
# Non-secret connection settings plus the wrapped access config; knows how to
# translate itself into keyword arguments for the Elasticsearch SDK client.
class ElasticsearchConnectionConfig(ConnectionConfig):
    hosts: Optional[list[str]] = Field(
        examples=["http://localhost:9200"],
        description="list of the Elasticsearch hosts to connect to",
        default=None,
    )
    username: Optional[str] = Field(description="username when using basic auth", default=None)
    cloud_id: Optional[str] = Field(description="id used to connect to Elastic Cloud", default=None)
    api_key_id: Optional[str] = Field(
        description="id associated with api key used for authentication: "
        "https://www.elastic.co/guide/en/elasticsearch/reference/current/security-api-create-api-key.html",  # noqa: E501
        default=None,
    )
    ca_certs: Optional[Path] = None
    access_config: Secret[ElasticsearchAccessConfig]

    @field_validator("hosts", mode="before")
    def to_list(cls, value):
        """Accept a single host string by wrapping it in a one-element list."""
        return [value] if isinstance(value, str) else value

    def get_client_kwargs(self) -> dict:
        """Build the keyword arguments for the Elasticsearch client.

        The auth-related fields are reshaped to what the SDK expects for its
        supported connection methods:
        https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html

        Returns:
            A dict holding only the non-``None`` values among ``hosts``,
            ``cloud_id``, ``ca_certs``, ``basic_auth`` and ``api_key``, with
            the credentials unwrapped to their plain values.
        """
        secrets = self.access_config.get_secret_value()

        # Plain connection settings are copied over only when they are set.
        raw_inputs: dict[str, Any] = {}
        for setting in ("hosts", "cloud_id", "ca_certs"):
            setting_value = getattr(self, setting)
            if setting_value:
                raw_inputs[setting] = setting_value

        # Exactly one auth mechanism is selected; the first match wins.
        password = secrets.password
        es_api_key = secrets.es_api_key
        secured_endpoint = self.cloud_id or self.ca_certs or secrets.ssl_assert_fingerprint
        if password and secured_endpoint:
            # NOTE(review): the username is hard-coded here and `self.username`
            # is ignored on this branch -- confirm that this is intended.
            raw_inputs["basic_auth"] = ("elastic", password)
        elif password and self.username and not self.cloud_id:
            raw_inputs["basic_auth"] = (self.username, password)
        elif es_api_key and self.api_key_id:
            raw_inputs["api_key"] = (self.api_key_id, es_api_key)
        elif es_api_key:
            raw_inputs["api_key"] = es_api_key

        client_input = ElasticsearchClientInput(**raw_inputs)
        resolved = client_input.model_dump()
        # Logged before the secrets are unwrapped below.
        logger.debug(f"elasticsearch client inputs mapped to: {resolved}")

        # Replace the secret wrappers by the raw values the SDK needs.
        if client_input.basic_auth:
            resolved["basic_auth"] = client_input.basic_auth.get_secret_value()
        else:
            resolved["basic_auth"] = None
        if client_input.api_key:
            resolved["api_key"] = client_input.api_key.get_secret_value()
        else:
            resolved["api_key"] = None

        return {name: value for name, value in resolved.items() if value is not None}

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    @contextmanager
    def get_client(self) -> Generator["ElasticsearchClient", None, None]:
        """Yield a synchronous Elasticsearch client, managed as a context manager."""
        from elasticsearch import Elasticsearch as ElasticsearchClient

        sdk_kwargs = self.get_client_kwargs()
        with ElasticsearchClient(**sdk_kwargs) as es_client:
            yield es_client
150
+
151
+
152
# Settings for enumerating the documents of a source index.
class ElasticsearchIndexerConfig(IndexerConfig):
    # Index whose documents are listed.
    index_name: str
    # Number of document ids grouped into each emitted batch.
    batch_size: int = 100
155
+
156
+
157
@dataclass
class ElasticsearchIndexer(Indexer):
    """Enumerate the documents of one Elasticsearch index as batches of ids.

    Attributes:
        connection_config: Connection settings used to create the client.
        index_config: Index name and batch size used for the enumeration.
        connector_type: Connector identifier, defaults to ``CONNECTOR_TYPE``.
    """

    connection_config: ElasticsearchConnectionConfig
    index_config: ElasticsearchIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Check that the cluster answers and the configured index is listed.

        Raises:
            SourceConnectionError: if creating the client or listing the
                aliases fails, or if the configured index name is not among
                the returned names. The triggering exception is chained as
                ``__cause__``.
        """
        try:
            with self.connection_config.get_client() as client:
                indices = client.indices.get_alias(index="*")
                if self.index_config.index_name not in indices:
                    raise SourceConnectionError(
                        "index {} not found: {}".format(
                            self.index_config.index_name, ", ".join(indices.keys())
                        )
                    )
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the cause so the original traceback is not lost.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def load_scan(self):
        """Import and return ``elasticsearch.helpers.scan`` lazily."""
        from elasticsearch.helpers import scan

        return scan

    def _get_doc_ids(self) -> set[str]:
        """Fetch the ids of all documents in the configured index."""
        scan = self.load_scan()

        # `stored_fields: []` asks for hits without any stored field values.
        scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
        with self.connection_config.get_client() as client:
            hits = scan(
                client,
                query=scan_query,
                scroll="1m",
                index=self.index_config.index_name,
            )
            # Consume the hits while the client is still open.
            return {hit["_id"] for hit in hits}

    def run(self, **kwargs: Any) -> Generator[ElasticsearchBatchFileData, None, None]:
        """Yield one batch record per ``batch_size`` document ids.

        Args:
            **kwargs: Accepted for interface compatibility; not used.

        Yields:
            ElasticsearchBatchFileData describing the batch: its item ids, the
            source url and the index name.
        """
        ids = list(self._get_doc_ids())

        # `hosts` is optional (a connection may be defined through `cloud_id`
        # alone), so do not index into it blindly. The url does not depend on
        # the batch, hence it is built once.
        hosts = self.connection_config.hosts
        location = hosts[0] if hosts else self.connection_config.cloud_id
        url = f"{location}/{self.index_config.index_name}"

        for batch in batch_generator(ids, self.index_config.batch_size):
            batch_items = [BatchItem(identifier=b) for b in batch]
            display_name = (
                f"url={url}, batch_size={len(batch_items)} "
                f"ids={batch_items[0].identifier}..{batch_items[-1].identifier}"
            )
            yield ElasticsearchBatchFileData(
                connector_type=CONNECTOR_TYPE,
                metadata=FileDataSourceMetadata(
                    url=url,
                    date_processed=str(time()),
                ),
                additional_metadata=ElastisearchAdditionalMetadata(
                    index_name=self.index_config.index_name,
                ),
                batch_items=batch_items,
                display_name=display_name,
            )
221
+
222
+
223
# Settings for downloading documents from an index.
class ElasticsearchDownloaderConfig(DownloaderConfig):
    # Document fields to fetch; an empty list means "all fields".
    # Declared with pydantic's `Field`, like the other config models in this
    # module, rather than `dataclasses.field`.
    # NOTE(review): assumes DownloaderConfig is a pydantic model, like the
    # other config classes in this module -- confirm.
    fields: list[str] = Field(default_factory=list)
225
+
226
+
227
@dataclass
class ElasticsearchDownloader(Downloader):
    """Download the documents of a batch and write each one to a text file.

    Attributes:
        connection_config: Connection settings used to create the client.
        download_config: Download settings, including the optional field filter.
        connector_type: Connector identifier, defaults to ``CONNECTOR_TYPE``.
    """

    connection_config: ElasticsearchConnectionConfig
    download_config: ElasticsearchDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Report that this downloader is driven through ``run_async``."""
        return True

    def get_identifier(self, index_name: str, record_id: str) -> str:
        """Build the identifier used for a downloaded document.

        Args:
            index_name: Index the document was read from.
            record_id: Id of the document.

        Returns:
            ``"<index_name>-<record_id>"``; when a field filter is configured,
            the first 8 hex characters of the SHA-256 of the comma-joined
            field names are appended after another ``-``.
        """
        identifier = f"{index_name}-{record_id}"
        fields = self.download_config.fields
        if fields:
            fields_digest = hashlib.sha256(",".join(fields).encode()).hexdigest()[:8]
            identifier = f"{identifier}-{fields_digest}"
        return identifier

    def map_es_results(self, es_results: dict) -> str:
        """Render a hit's ``_source`` as text, one flattened value per line."""
        flattened_dict = flatten_dict(dictionary=es_results["_source"])
        return "\n".join(str(value) for value in flattened_dict.values())

    def generate_download_response(
        self, result: dict, index_name: str, file_data: ElasticsearchBatchFileData
    ) -> DownloadResponse:
        """Write one hit to disk and build its download response.

        Args:
            result: A single hit; ``_id`` and ``_source`` are read, and
                ``_version`` when present.
            index_name: Index the hit was read from.
            file_data: Batch record the hit belongs to.

        Returns:
            The response produced by the parent class for the per-document
            file data and the written file path.

        Raises:
            SourceConnectionNetworkError: if writing the file fails; the
                original exception is chained as ``__cause__``.
        """
        record_id = result["_id"]
        filename_id = self.get_identifier(index_name=index_name, record_id=record_id)
        filename = f"{filename_id}.txt"
        download_path = self.download_dir / Path(filename)
        logger.debug(
            f"Downloading results from index {index_name} and id {record_id} to {download_path}"
        )
        download_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(download_path, "w", encoding="utf8") as f:
                f.write(self.map_es_results(es_results=result))
        except Exception as e:
            logger.error(
                f"failed to download from index {index_name} "
                f"and id {record_id} to {download_path}: {e}",
                exc_info=True,
            )
            # Chain the cause so the original traceback is not lost.
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e

        # Derive a per-document record from the batch record.
        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
        cast_file_data = FileData.cast(file_data=file_data)
        cast_file_data.identifier = filename_id
        cast_file_data.metadata.date_processed = str(time())
        cast_file_data.metadata.version = str(result["_version"]) if "_version" in result else None
        cast_file_data.metadata.record_locator = {
            "hosts": self.connection_config.hosts,
            "index_name": index_name,
            "document_id": record_id,
        }
        return super().generate_download_response(
            file_data=cast_file_data,
            download_path=download_path,
        )

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Not supported; this downloader only implements ``run_async``."""
        raise NotImplementedError()

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def load_async(self):
        """Import and return ``AsyncElasticsearch`` and ``async_scan`` lazily."""
        from elasticsearch import AsyncElasticsearch
        from elasticsearch.helpers import async_scan

        return AsyncElasticsearch, async_scan

    async def run_async(self, file_data: BatchFileData, **kwargs: Any) -> download_responses:
        """Fetch every document of the batch and write each one to disk.

        Args:
            file_data: Batch record holding the document ids and index name.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            One download response per hit returned by the scan.
        """
        elasticsearch_filedata = ElasticsearchBatchFileData.cast(file_data=file_data)
        AsyncClient, async_scan = self.load_async()

        index_name: str = elasticsearch_filedata.additional_metadata.index_name
        ids: list[str] = [item.identifier for item in elasticsearch_filedata.batch_items]

        scan_query = {
            "version": True,
            "query": {"ids": {"values": ids}},
        }

        # Only add _source if fields are explicitly specified
        # Omitting _source returns all fields (default behavior)
        # This avoids AWS OpenSearch FGAC timeout issues with empty lists
        if self.download_config.fields:
            scan_query["_source"] = self.download_config.fields

        # Named `responses` so the imported `download_responses` return-type
        # alias is not shadowed inside this function.
        responses = []
        async with AsyncClient(**self.connection_config.get_client_kwargs()) as client:
            async for result in async_scan(
                client,
                query=scan_query,
                scroll="1m",
                index=index_name,
            ):
                responses.append(
                    self.generate_download_response(
                        result=result, index_name=index_name, file_data=elasticsearch_filedata
                    )
                )
        return responses
330
+
331
+
332
# Settings for staging elements before upload; only the target index is needed.
class ElasticsearchUploadStagerConfig(UploadStagerConfig):
    index_name: str = Field(
        description="Name of the Elasticsearch index to pull data from, or upload data to.",
    )
336
+
337
+
338
@dataclass
class ElasticsearchUploadStager(UploadStager):
    """Reshape element dicts into Elasticsearch bulk actions."""

    upload_stager_config: ElasticsearchUploadStagerConfig

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Convert one element into a bulk action for the configured index.

        No ``_op_type`` is set on the action, so the default "index"
        operation applies: overwrite if the document exists, create it if not.

        Args:
            element_dict: The element to convert; a shallow copy is consumed,
                so the caller's top-level keys are left in place.
            file_data: Record the element belongs to; provides the record id.

        Returns:
            A dict with ``_index``, ``_id`` and ``_source``. ``_source`` holds
            the element's id, embeddings, text and type (``None`` when
            absent), the record id, and -- when the element has a dict under
            ``metadata`` -- that metadata flattened with ``-`` as separator.
        """
        remaining = element_dict.copy()
        target_index = self.upload_stager_config.index_name
        # The id is derived before any key is popped from the copy.
        document_id = get_enhanced_element_id(element_dict=remaining, file_data=file_data)

        source = {
            key: remaining.pop(key, None)
            for key in ("element_id", "embeddings", "text", "type")
        }
        source[RECORD_ID_LABEL] = file_data.identifier

        metadata = remaining.get("metadata")
        if isinstance(metadata, dict):
            source["metadata"] = flatten_dict(metadata, separator="-")

        return {"_index": target_index, "_id": document_id, "_source": source}
360
+
361
+
362
# Settings controlling how staged elements are written to the target index.
class ElasticsearchUploaderConfig(UploaderConfig):
    index_name: str = Field(
        description="Name of the Elasticsearch index to pull data from, or upload data to.",
    )
    batch_size_bytes: int = Field(
        description="Size limit (in bytes) for each batch of items to be uploaded. Check"
        " https://www.elastic.co/guide/en/elasticsearch/guide/current/bulk.html"
        "#_how_big_is_too_big for more information.",
        default=15_000_000,
    )
    num_threads: int = Field(
        description="Number of threads to be used while uploading content",
        default=4,
    )
    record_id_key: str = Field(
        description="searchable key to find entries for the same record on previous runs",
        default=RECORD_ID_LABEL,
    )
379
+
380
+
381
@dataclass
class ElasticsearchUploader(Uploader):
    """Upload staged bulk actions to an Elasticsearch index.

    ``upload_config`` supplies the index name, batch byte budget and thread count;
    ``connection_config`` supplies the client used for every request.
    """

    connector_type: str = CONNECTOR_TYPE
    upload_config: ElasticsearchUploaderConfig
    connection_config: ElasticsearchConnectionConfig

    def precheck(self) -> None:
        """Verify that the cluster is reachable and the configured index exists.

        Raises:
            DestinationConnectionError: If the client cannot be created or queried,
                or if ``upload_config.index_name`` is not among the returned indices.
                The triggering exception is attached as ``__cause__``.
        """
        try:
            with self.connection_config.get_client() as client:
                indices = client.indices.get_alias(index="*")
                if self.upload_config.index_name not in indices:
                    raise DestinationConnectionError(
                        "index {} not found: {}".format(
                            self.upload_config.index_name, ", ".join(indices.keys())
                        )
                    )
        except Exception as e:
            # The "index not found" error raised above also lands here, so every
            # failure leaves this method with the same message prefix.
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def load_parallel_bulk(self):
        """Return the ``parallel_bulk`` helper, imported lazily.

        Kept as a separate method so the helper is only imported when needed.
        """
        from elasticsearch.helpers import parallel_bulk

        return parallel_bulk

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Send ``data`` to the configured index in size-limited batches.

        Args:
            data: Bulk actions to upload.
            file_data: Record the actions belong to (not read by this method).
            **kwargs: Accepted for interface compatibility; unused here.

        Raises:
            DestinationConnectionError: If a batch fails with ``BulkIndexError``.
            UnstructuredIngestError: If a batch fails with any other exception.
            In both cases the original exception is attached as ``__cause__``.
        """
        from elasticsearch.helpers.errors import BulkIndexError

        parallel_bulk = self.load_parallel_bulk()
        upload_destination = self.connection_config.hosts or self.connection_config.cloud_id

        logger.info(
            f"writing {len(data)} elements via document batches to destination "
            f"index named {self.upload_config.index_name} at {upload_destination} with "
            f"batch size (in bytes) {self.upload_config.batch_size_bytes} with "
            f"{self.upload_config.num_threads} (number of) threads"
        )

        with self.connection_config.get_client() as client:
            # A missing index is only reported; the upload is still attempted.
            if not client.indices.exists(index=self.upload_config.index_name):
                logger.warning(
                    f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
                    f"{self.upload_config.index_name}. "
                    f"This may cause issues when uploading."
                )
            for batch in generator_batching_wbytes(
                data, batch_size_limit_bytes=self.upload_config.batch_size_bytes
            ):
                try:
                    iterator = parallel_bulk(
                        client=client,
                        actions=batch,
                        thread_count=self.upload_config.num_threads,
                    )
                    # Exhaust the iterator without keeping its items; a zero-length
                    # deque discards everything it is fed.
                    collections.deque(iterator, maxlen=0)
                    logger.info(
                        f"uploaded batch of {len(batch)} elements to index "
                        f"{self.upload_config.index_name}"
                    )
                except BulkIndexError as e:
                    # Strip the uploaded documents from the errors before anything
                    # about this exception is logged.
                    sanitized_errors = [
                        self._sanitize_bulk_index_error(error) for error in e.errors
                    ]
                    logger.error(
                        f"Batch upload failed - {e} - with following errors: {sanitized_errors}"
                    )
                    raise DestinationConnectionError(str(e)) from e
                except Exception as e:
                    logger.error(f"Batch upload failed - {e}")
                    raise UnstructuredIngestError(str(e)) from e

    def _sanitize_bulk_index_error(self, error: dict[str, dict]) -> dict:
        """Remove data uploaded to index from the log, leave only error information.

        Error structure is `{<operation-type>: {..., "data": <uploaded-object>}}`

        The nested dictionaries are modified in place and the same ``error`` object
        is returned, so the caller's copy no longer contains the ``"data"`` entries.
        """
        for error_data in error.values():
            error_data.pop("data", None)
        return error
462
+
463
+
464
# Source-side registry entry: groups the Elasticsearch connection config with the
# indexer and downloader classes and their respective config classes.
elasticsearch_source_entry = SourceRegistryEntry(
    connection_config=ElasticsearchConnectionConfig,
    indexer_config=ElasticsearchIndexerConfig,
    indexer=ElasticsearchIndexer,
    downloader_config=ElasticsearchDownloaderConfig,
    downloader=ElasticsearchDownloader,
)
471
+
472
# Destination-side registry entry: groups the Elasticsearch connection config with
# the upload stager and uploader classes and their respective config classes.
elasticsearch_destination_entry = DestinationRegistryEntry(
    connection_config=ElasticsearchConnectionConfig,
    upload_stager=ElasticsearchUploadStager,
    upload_stager_config=ElasticsearchUploadStagerConfig,
    uploader=ElasticsearchUploader,
    uploader_config=ElasticsearchUploaderConfig,
)