unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +66 -12
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -21
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  34. unstructured_ingest/v2/interfaces/connector.py +5 -7
  35. unstructured_ingest/v2/interfaces/downloader.py +17 -8
  36. unstructured_ingest/v2/interfaces/file_data.py +13 -2
  37. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  38. unstructured_ingest/v2/interfaces/process.py +3 -4
  39. unstructured_ingest/v2/interfaces/processor.py +10 -10
  40. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  41. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  42. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  43. unstructured_ingest/v2/pipeline/pipeline.py +73 -7
  44. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  45. unstructured_ingest/v2/pipeline/steps/download.py +90 -24
  46. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  47. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  48. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  49. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  50. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  51. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  52. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  53. unstructured_ingest/v2/processes/__init__.py +18 -0
  54. unstructured_ingest/v2/processes/chunker.py +74 -28
  55. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  56. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  57. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
  58. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
  59. unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
  60. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  61. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
  63. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
  64. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
  66. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
  67. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
  68. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
  69. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
  70. unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
  71. unstructured_ingest/v2/processes/connectors/local.py +36 -28
  72. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
  74. unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
  75. unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
  76. unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
  77. unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
  78. unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
  79. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  80. unstructured_ingest/v2/processes/connectors/sql.py +52 -39
  81. unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
  82. unstructured_ingest/v2/processes/embedder.py +106 -47
  83. unstructured_ingest/v2/processes/filter.py +60 -0
  84. unstructured_ingest/v2/processes/partitioner.py +79 -33
  85. unstructured_ingest/v2/processes/uncompress.py +3 -3
  86. unstructured_ingest/v2/utils.py +45 -0
  87. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  88. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
  89. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  90. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  91. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  92. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  93. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  94. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  95. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  96. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  97. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  99. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
  100. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  101. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  102. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  103. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  104. unstructured_ingest/v2/cli/cmds/local.py +0 -60
  105. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  106. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  108. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  109. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  110. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  111. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  112. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  113. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  114. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  115. unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  116. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  117. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/couchbase.py (new file, +151 -0)
@@ -0,0 +1,151 @@
+import json
+from dataclasses import dataclass, field
+from datetime import timedelta
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    UploadContent,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from couchbase.cluster import Cluster
+
+CONNECTOR_TYPE = "couchbase"
+SERVER_API_VERSION = "1"
+
+
+class CouchbaseAccessConfig(AccessConfig):
+    password: str = Field(description="The password for the Couchbase server")
+
+
+class CouchbaseConnectionConfig(ConnectionConfig):
+    username: str = Field(description="The username for the Couchbase server")
+    bucket: str = Field(description="The bucket to connect to on the Couchbase server")
+    connection_string: str = Field(
+        default="couchbase://localhost", description="The connection string of the Couchbase server"
+    )
+    scope: str = Field(
+        default="_default", description="The scope to connect to on the Couchbase server"
+    )
+    collection: str = Field(
+        default="_default", description="The collection to connect to on the Couchbase server"
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    access_config: Secret[CouchbaseAccessConfig]
+
+
+class CouchbaseUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class CouchbaseUploadStager(UploadStager):
+    upload_stager_config: CouchbaseUploadStagerConfig = field(
+        default_factory=lambda: CouchbaseUploadStagerConfig()
+    )
+
+    def run(
+        self,
+        elements_filepath: Path,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents = json.load(elements_file)
+
+        output_elements = []
+        for element in elements_contents:
+            new_doc = {
+                element["element_id"]: {
+                    "embedding": element.get("embeddings", None),
+                    "text": element.get("text", None),
+                    "metadata": element.get("metadata", None),
+                    "type": element.get("type", None),
+                }
+            }
+            output_elements.append(new_doc)
+
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        with open(output_path, "w") as output_file:
+            json.dump(output_elements, output_file)
+        return output_path
+
+
+class CouchbaseUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=50, description="Number of documents to upload per batch")
+
+
+@dataclass
+class CouchbaseUploader(Uploader):
+    connection_config: CouchbaseConnectionConfig
+    upload_config: CouchbaseUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @requires_dependencies(["couchbase"], extras="couchbase")
+    def connect_to_couchbase(self) -> "Cluster":
+        from couchbase.auth import PasswordAuthenticator
+        from couchbase.cluster import Cluster
+        from couchbase.options import ClusterOptions
+
+        connection_string = self.connection_config.connection_string
+        username = self.connection_config.username
+        password = self.connection_config.access_config.get_secret_value().password
+
+        auth = PasswordAuthenticator(username, password)
+        options = ClusterOptions(auth)
+        options.apply_profile("wan_development")
+        cluster = Cluster(connection_string, options)
+        cluster.wait_until_ready(timedelta(seconds=5))
+        return cluster
+
+    def precheck(self) -> None:
+        try:
+            self.connect_to_couchbase()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
+        elements = []
+        for content in contents:
+            with open(content.path) as elements_file:
+                elements.extend(json.load(elements_file))
+
+        logger.info(
+            f"writing {len(elements)} objects to destination "
+            f"bucket, {self.connection_config.bucket} "
+            f"at {self.connection_config.connection_string}",
+        )
+        cluster = self.connect_to_couchbase()
+        bucket = cluster.bucket(self.connection_config.bucket)
+        scope = bucket.scope(self.connection_config.scope)
+        collection = scope.collection(self.connection_config.collection)
+
+        for chunk in batch_generator(elements, self.upload_config.batch_size):
+            collection.upsert_multi({doc_id: doc for doc in chunk for doc_id, doc in doc.items()})
+
+
+couchbase_destination_entry = DestinationRegistryEntry(
+    connection_config=CouchbaseConnectionConfig,
+    uploader=CouchbaseUploader,
+    uploader_config=CouchbaseUploaderConfig,
+    upload_stager=CouchbaseUploadStager,
+    upload_stager_config=CouchbaseUploadStagerConfig,
+)
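The new Couchbase destination is driven entirely by the pydantic models added above. Below is a minimal sketch of wiring the uploader up by hand, assuming the couchbase extra is installed and a cluster is reachable; the connection string, bucket name, and credentials are placeholders, not values shipped with the package.

from unstructured_ingest.v2.processes.connectors.couchbase import (
    CouchbaseAccessConfig,
    CouchbaseConnectionConfig,
    CouchbaseUploader,
    CouchbaseUploaderConfig,
)

# Placeholder cluster details; pydantic wraps the access config in a Secret during validation.
connection_config = CouchbaseConnectionConfig(
    connection_string="couchbase://localhost",
    username="Administrator",
    bucket="ingest-output",
    access_config=CouchbaseAccessConfig(password="example-password"),
)

uploader = CouchbaseUploader(
    connection_config=connection_config,
    upload_config=CouchbaseUploaderConfig(batch_size=50),
)
uploader.precheck()  # opens a cluster connection and fails fast on bad credentials

Note that the stager keys each staged document by its element_id, which is exactly the mapping that upsert_multi consumes in the uploader's run loop.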
unstructured_ingest/v2/processes/connectors/databricks_volumes.py (+95 -31)
@@ -1,8 +1,10 @@
 import os
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional

-from unstructured_ingest.enhanced_dataclass import enhanced_field
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -11,6 +13,7 @@ from unstructured_ingest.v2.interfaces import (
     Uploader,
     UploaderConfig,
 )
+from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry

 if TYPE_CHECKING:
@@ -19,45 +22,99 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "databricks_volumes"


-@dataclass
 class DatabricksVolumesAccessConfig(AccessConfig):
-    account_id: Optional[str] = None
-    username: Optional[str] = None
-    password: Optional[str] = None
-    client_id: Optional[str] = None
-    client_secret: Optional[str] = None
-    token: Optional[str] = None
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks "
+        "accounts endpoint. Only has effect when Host is "
+        "either https://accounts.cloud.databricks.com/ (AWS), "
+        "https://accounts.azuredatabricks.net/ (Azure), "
+        "or https://accounts.gcp.databricks.com/ (GCP).",
+    )
+    username: Optional[str] = Field(
+        default=None,
+        description="The Databricks username part of basic authentication. "
+        "Only possible when Host is *.cloud.databricks.com (AWS).",
+    )
+    password: Optional[str] = Field(
+        default=None,
+        description="The Databricks password part of basic authentication. "
+        "Only possible when Host is *.cloud.databricks.com (AWS).",
+    )
+    client_id: Optional[str] = Field(default=None)
+    client_secret: Optional[str] = Field(default=None)
+    token: Optional[str] = Field(
+        default=None,
+        description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
+        "Azure Active Directory (Azure AD) token (Azure).",
+    )
     profile: Optional[str] = None
-    azure_workspace_resource_id: Optional[str] = None
-    azure_client_secret: Optional[str] = None
-    azure_client_id: Optional[str] = None
-    azure_tenant_id: Optional[str] = None
-    azure_environment: Optional[str] = None
-    auth_type: Optional[str] = None
+    azure_workspace_resource_id: Optional[str] = Field(
+        default=None,
+        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+        "which is exchanged for a Databricks host URL.",
+    )
+    azure_client_secret: Optional[str] = Field(
+        default=None, description="The Azure AD service principal's client secret."
+    )
+    azure_client_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal's application ID."
+    )
+    azure_tenant_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal's tenant ID."
+    )
+    azure_environment: Optional[str] = Field(
+        default=None,
+        description="The Azure environment type for a specific set of API endpoints",
+        examples=["Public", "UsGov", "China", "Germany"],
+    )
+    auth_type: Optional[str] = Field(
+        default=None,
+        description="When multiple auth attributes are available in the "
+        "environment, use the auth type specified by this "
+        "argument. This argument also holds the currently "
+        "selected auth.",
+    )
     cluster_id: Optional[str] = None
     google_credentials: Optional[str] = None
     google_service_account: Optional[str] = None


-@dataclass
+SecretDatabricksVolumesAccessConfig = Secret[DatabricksVolumesAccessConfig]
+
+
 class DatabricksVolumesConnectionConfig(ConnectionConfig):
-    access_config: DatabricksVolumesAccessConfig = enhanced_field(
-        default_factory=DatabricksVolumesAccessConfig, sensitive=True
+    access_config: SecretDatabricksVolumesAccessConfig = Field(
+        default_factory=lambda: SecretDatabricksVolumesAccessConfig(
+            secret_value=DatabricksVolumesAccessConfig()
+        )
+    )
+    host: Optional[str] = Field(
+        default=None,
+        description="The Databricks host URL for either the "
+        "Databricks workspace endpoint or the "
+        "Databricks accounts endpoint.",
     )
-    host: Optional[str] = None


-@dataclass
 class DatabricksVolumesUploaderConfig(UploaderConfig):
-    volume: str
-    catalog: str
-    volume_path: Optional[str] = None
-    overwrite: bool = False
-    schema: str = "default"
+    volume: str = Field(description="Name of volume in the Unity Catalog")
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    volume_path: Optional[str] = Field(
+        default=None, description="Optional path within the volume to write to"
+    )
+    overwrite: bool = Field(
+        default=False, description="If true, an existing file will be overwritten."
+    )
+    databricks_schema: str = Field(
+        default="default",
+        alias="schema",
+        description="Schema associated with the volume to write to in the Unity Catalog service",
+    )

     @property
     def path(self) -> str:
-        path = f"/Volumes/{self.catalog}/{self.schema}/{self.volume}"
+        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
         if self.volume_path:
             path = f"{path}/{self.volume_path}"
         return path
@@ -68,21 +125,28 @@ class DatabricksVolumesUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: DatabricksVolumesUploaderConfig
     connection_config: DatabricksVolumesConnectionConfig
-    client: Optional["WorkspaceClient"] = field(init=False, default=None)

     @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
-    def __post_init__(self) -> "WorkspaceClient":
+    def get_client(self) -> "WorkspaceClient":
         from databricks.sdk import WorkspaceClient

-        self.client = WorkspaceClient(
-            host=self.connection_config.host, **self.connection_config.access_config.to_dict()
+        return WorkspaceClient(
+            host=self.connection_config.host,
+            **self.connection_config.access_config.get_secret_value().dict(),
         )

+    def precheck(self) -> None:
+        try:
+            assert self.get_client().current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         for content in contents:
             with open(content.path, "rb") as elements_file:
                 output_path = os.path.join(self.upload_config.path, content.path.name)
-                self.client.files.upload(
+                self.get_client().files.upload(
                     file_path=output_path,
                     contents=elements_file,
                     overwrite=self.upload_config.overwrite,
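One detail worth calling out in this diff: the `schema` field was renamed to `databricks_schema` (presumably to avoid shadowing pydantic's own `schema` machinery), while the `alias="schema"` keeps the old input name working. A small sketch of that behavior, with placeholder catalog, schema, and volume names that are not defaults from the package:

from unstructured_ingest.v2.processes.connectors.databricks_volumes import (
    DatabricksVolumesUploaderConfig,
)

# "main", "landing", and "raw-documents" are illustrative names only.
upload_config = DatabricksVolumesUploaderConfig(
    volume="raw-documents",
    catalog="main",
    schema="landing",  # accepted via the alias, stored on databricks_schema
)
print(upload_config.path)  # -> /Volumes/main/landing/raw-documents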
unstructured_ingest/v2/processes/connectors/elasticsearch.py (+92 -53)
@@ -5,12 +5,15 @@ import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import TYPE_CHECKING, Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional, Union

-from unstructured.documents.elements import DataSourceMetadata
+from pydantic import BaseModel, Field, Secret, SecretStr

-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -20,6 +23,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     UploadContent,
@@ -41,57 +45,74 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "elasticsearch"


-@dataclass
 class ElasticsearchAccessConfig(AccessConfig):
-    password: Optional[str] = None
-    api_key: Optional[str] = enhanced_field(default=None, overload_name="es_api_key")
-    bearer_auth: Optional[str] = None
-    ssl_assert_fingerprint: Optional[str] = None
-
-
-@dataclass
-class ElasticsearchClientInput(EnhancedDataClassJsonMixin):
+    password: Optional[str] = Field(
+        default=None, description="password when using basic auth or connecting to a cloud instance"
+    )
+    es_api_key: Optional[str] = Field(default=None, description="api key used for authentication")
+    bearer_auth: Optional[str] = Field(
+        default=None, description="bearer token used for HTTP bearer authentication"
+    )
+    ssl_assert_fingerprint: Optional[str] = Field(
+        default=None, description="SHA256 fingerprint value"
+    )
+
+
+class ElasticsearchClientInput(BaseModel):
     hosts: Optional[list[str]] = None
     cloud_id: Optional[str] = None
-    ca_certs: Optional[str] = None
-    basic_auth: Optional[tuple[str, str]] = enhanced_field(sensitive=True, default=None)
-    api_key: Optional[str] = enhanced_field(sensitive=True, default=None)
+    ca_certs: Optional[Path] = None
+    basic_auth: Optional[Secret[tuple[str, str]]] = None
+    api_key: Optional[Union[Secret[tuple[str, str]], SecretStr]] = None


-@dataclass
 class ElasticsearchConnectionConfig(ConnectionConfig):
-    hosts: Optional[list[str]] = None
-    username: Optional[str] = None
-    cloud_id: Optional[str] = None
-    api_key_id: Optional[str] = None
-    ca_certs: Optional[str] = None
-    access_config: ElasticsearchAccessConfig = enhanced_field(sensitive=True)
+    hosts: Optional[list[str]] = Field(
+        default=None,
+        description="list of the Elasticsearch hosts to connect to",
+        examples=["http://localhost:9200"],
+    )
+    username: Optional[str] = Field(default=None, description="username when using basic auth")
+    cloud_id: Optional[str] = Field(default=None, description="id used to connect to Elastic Cloud")
+    api_key_id: Optional[str] = Field(
+        default=None,
+        description="id associated with api key used for authentication: "
+        "https://www.elastic.co/guide/en/elasticsearch/reference/current/security-api-create-api-key.html",  # noqa: E501
+    )
+    ca_certs: Optional[Path] = None
+    access_config: Secret[ElasticsearchAccessConfig]

     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:
         # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
-        client_input = ElasticsearchClientInput()
+        client_input_kwargs: dict[str, Any] = {}
+        access_config = self.access_config.get_secret_value()
         if self.hosts:
-            client_input.hosts = self.hosts
+            client_input_kwargs["hosts"] = self.hosts
         if self.cloud_id:
-            client_input.cloud_id = self.cloud_id
+            client_input_kwargs["cloud_id"] = self.cloud_id
         if self.ca_certs:
-            client_input.ca_certs = self.ca_certs
-        if self.access_config.password and (
-            self.cloud_id or self.ca_certs or self.access_config.ssl_assert_fingerprint
+            client_input_kwargs["ca_certs"] = self.ca_certs
+        if access_config.password and (
+            self.cloud_id or self.ca_certs or access_config.ssl_assert_fingerprint
         ):
-            client_input.basic_auth = ("elastic", self.access_config.password)
-        elif not self.cloud_id and self.username and self.access_config.password:
-            client_input.basic_auth = (self.username, self.access_config.password)
-        elif self.access_config.api_key and self.api_key_id:
-            client_input.api_key = (self.api_key_id, self.access_config.api_key)
-        elif self.access_config.api_key:
-            client_input.api_key = self.access_config.api_key
-        logger.debug(
-            f"Elasticsearch client inputs mapped to: {client_input.to_dict(redact_sensitive=True)}"
+            client_input_kwargs["basic_auth"] = ("elastic", access_config.password)
+        elif not self.cloud_id and self.username and access_config.password:
+            client_input_kwargs["basic_auth"] = (self.username, access_config.password)
+        elif access_config.es_api_key and self.api_key_id:
+            client_input_kwargs["api_key"] = (self.api_key_id, access_config.es_api_key)
+        elif access_config.es_api_key:
+            client_input_kwargs["api_key"] = access_config.es_api_key
+        client_input = ElasticsearchClientInput(**client_input_kwargs)
+        logger.debug(f"Elasticsearch client inputs mapped to: {client_input.dict()}")
+        client_kwargs = client_input.dict()
+        client_kwargs["basic_auth"] = (
+            client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
+        )
+        client_kwargs["api_key"] = (
+            client_input.api_key.get_secret_value() if client_input.api_key else None
         )
-        client_kwargs = client_input.to_dict(redact_sensitive=False)
         client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None}
         return client_kwargs
@@ -111,7 +132,6 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
             raise SourceConnectionError(f"failed to validate connection: {e}")


-@dataclass
 class ElasticsearchIndexerConfig(IndexerConfig):
     index_name: str
     batch_size: int = 100
@@ -121,11 +141,14 @@ class ElasticsearchIndexerConfig(IndexerConfig):
 class ElasticsearchIndexer(Indexer):
     connection_config: ElasticsearchConnectionConfig
     index_config: ElasticsearchIndexerConfig
-    client: "ElasticsearchClient" = field(init=False)
     connector_type: str = CONNECTOR_TYPE

-    def __post_init__(self):
-        self.client = self.connection_config.get_client()
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")

     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_scan(self):
@@ -138,8 +161,9 @@ class ElasticsearchIndexer(Indexer):
         scan = self.load_scan()

         scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
+        client = self.connection_config.get_client()
         hits = scan(
-            self.client,
+            client,
             query=scan_query,
             scroll="1m",
             index=self.index_config.index_name,
@@ -168,7 +192,7 @@ class ElasticsearchIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
@@ -179,7 +203,6 @@ class ElasticsearchIndexer(Indexer):
             )


-@dataclass
 class ElasticsearchDownloaderConfig(DownloaderConfig):
     fields: list[str] = field(default_factory=list)

@@ -234,7 +257,7 @@ class ElasticsearchDownloader(Downloader):
                 file_data=FileData(
                     identifier=filename_id,
                     connector_type=CONNECTOR_TYPE,
-                    metadata=DataSourceMetadata(
+                    metadata=FileDataSourceMetadata(
                         version=str(result["_version"]) if "_version" in result else None,
                         date_processed=str(time()),
                         record_locator={
@@ -285,9 +308,10 @@ class ElasticsearchDownloader(Downloader):
         return download_responses


-@dataclass
 class ElasticsearchUploadStagerConfig(UploadStagerConfig):
-    index_name: str
+    index_name: str = Field(
+        description="Name of the Elasticsearch index to pull data from, or upload data to."
+    )


 @dataclass
@@ -326,11 +350,19 @@ class ElasticsearchUploadStager(UploadStager):
         return output_path


-@dataclass
 class ElasticsearchUploaderConfig(UploaderConfig):
-    index_name: str
-    batch_size_bytes: int = 15_000_000
-    num_threads: int = 4
+    index_name: str = Field(
+        description="Name of the Elasticsearch index to pull data from, or upload data to."
+    )
+    batch_size_bytes: int = Field(
+        default=15_000_000,
+        description="Size limit (in bytes) for each batch of items to be uploaded. Check"
+        " https://www.elastic.co/guide/en/elasticsearch/guide/current/bulk.html"
+        "#_how_big_is_too_big for more information.",
+    )
+    num_threads: int = Field(
+        default=4, description="Number of threads to be used while uploading content"
+    )


 @dataclass
@@ -339,6 +371,13 @@ class ElasticsearchUploader(Uploader):
     upload_config: ElasticsearchUploaderConfig
     connection_config: ElasticsearchConnectionConfig

+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_parallel_bulk(self):
         from elasticsearch.helpers import parallel_bulk
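The net effect of the Elasticsearch changes is that credentials now live inside Secret-wrapped pydantic models and are only unwrapped when get_client_kwargs() hands them to the SDK client. A rough sketch with a placeholder host and placeholder credentials (building the kwargs does not require the elasticsearch extra; only get_client() does):

from unstructured_ingest.v2.processes.connectors.elasticsearch import (
    ElasticsearchAccessConfig,
    ElasticsearchConnectionConfig,
)

connection_config = ElasticsearchConnectionConfig(
    hosts=["http://localhost:9200"],  # example host taken from the field metadata
    username="elastic",               # placeholder credentials
    access_config=ElasticsearchAccessConfig(password="example-password"),
)

# username/password are folded into basic_auth; the Secret wrapper is only unwrapped
# at this point, while the earlier debug log still shows the masked Secret values.
print(connection_config.get_client_kwargs())
# -> {'hosts': ['http://localhost:9200'], 'basic_auth': ('elastic', 'example-password')}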