unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +66 -12
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -21
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  34. unstructured_ingest/v2/interfaces/connector.py +5 -7
  35. unstructured_ingest/v2/interfaces/downloader.py +17 -8
  36. unstructured_ingest/v2/interfaces/file_data.py +13 -2
  37. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  38. unstructured_ingest/v2/interfaces/process.py +3 -4
  39. unstructured_ingest/v2/interfaces/processor.py +10 -10
  40. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  41. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  42. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  43. unstructured_ingest/v2/pipeline/pipeline.py +73 -7
  44. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  45. unstructured_ingest/v2/pipeline/steps/download.py +90 -24
  46. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  47. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  48. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  49. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  50. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  51. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  52. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  53. unstructured_ingest/v2/processes/__init__.py +18 -0
  54. unstructured_ingest/v2/processes/chunker.py +74 -28
  55. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  56. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  57. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
  58. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
  59. unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
  60. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  61. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
  63. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
  64. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
  66. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
  67. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
  68. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
  69. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
  70. unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
  71. unstructured_ingest/v2/processes/connectors/local.py +36 -28
  72. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
  74. unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
  75. unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
  76. unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
  77. unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
  78. unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
  79. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  80. unstructured_ingest/v2/processes/connectors/sql.py +52 -39
  81. unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
  82. unstructured_ingest/v2/processes/embedder.py +106 -47
  83. unstructured_ingest/v2/processes/filter.py +60 -0
  84. unstructured_ingest/v2/processes/partitioner.py +79 -33
  85. unstructured_ingest/v2/processes/uncompress.py +3 -3
  86. unstructured_ingest/v2/utils.py +45 -0
  87. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  88. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
  89. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  90. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  91. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  92. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  93. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  94. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  95. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  96. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  97. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  99. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
  100. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  101. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  102. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  103. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  104. unstructured_ingest/v2/cli/cmds/local.py +0 -60
  105. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  106. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  108. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  109. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  110. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  111. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  112. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  113. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  114. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  115. unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  116. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  117. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -3,9 +3,10 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
-from unstructured.__version__ import __version__ as unstructured_version
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.__version__ import __version__ as unstructured_version
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -30,25 +31,28 @@ CONNECTOR_TYPE = "mongodb"
 SERVER_API_VERSION = "1"
 
 
-@dataclass
 class MongoDBAccessConfig(AccessConfig):
-    uri: Optional[str] = None
+    uri: Optional[str] = Field(default=None, description="URI to user when connecting")
+
+
+SecretMongoDBAccessConfig = Secret[MongoDBAccessConfig]
 
 
-@dataclass
 class MongoDBConnectionConfig(ConnectionConfig):
-    access_config: MongoDBAccessConfig = enhanced_field(
-        sensitive=True, default_factory=MongoDBAccessConfig
+    access_config: SecretMongoDBAccessConfig = Field(
+        default_factory=lambda: SecretMongoDBAccessConfig(secret_value=MongoDBAccessConfig())
     )
-    host: Optional[str] = None
-    database: Optional[str] = None
-    collection: Optional[str] = None
-    port: int = 27017
-    batch_size: int = 100
-    connector_type: str = CONNECTOR_TYPE
+    host: Optional[str] = Field(
+        default=None,
+        description="hostname or IP address or Unix domain socket path of a single mongod or "
+        "mongos instance to connect to, or a list of hostnames",
+    )
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")
+    port: int = Field(default=27017)
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
 
-@dataclass
 class MongoDBUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -76,20 +80,23 @@ class MongoDBUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class MongoDBUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")
 
 
 @dataclass
 class MongoDBUploader(Uploader):
     upload_config: MongoDBUploaderConfig
     connection_config: MongoDBConnectionConfig
-    client: Optional["MongoClient"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def __post_init__(self):
-        self.client = self.create_client()
+    def precheck(self) -> None:
+        try:
+            client = self.create_client()
+            client.admin.command("ping")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["pymongo"], extras="mongodb")
     def create_client(self) -> "MongoClient":
@@ -97,9 +104,11 @@ class MongoDBUploader(Uploader):
         from pymongo.driver_info import DriverInfo
         from pymongo.server_api import ServerApi
 
-        if self.connection_config.access_config.uri:
+        access_config = self.connection_config.access_config.get_secret_value()
+
+        if access_config.uri:
             return MongoClient(
-                self.connection_config.access_config.uri,
+                access_config.uri,
                 server_api=ServerApi(version=SERVER_API_VERSION),
                 driver=DriverInfo(name="unstructured", version=unstructured_version),
             )
@@ -123,7 +132,8 @@ class MongoDBUploader(Uploader):
             f"collection {self.connection_config.collection} "
             f"at {self.connection_config.host}",
         )
-        db = self.client[self.connection_config.database]
+        client = self.create_client()
+        db = client[self.connection_config.database]
         collection = db[self.connection_config.collection]
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             collection.insert_many(chunk)
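
The change repeated across these connector diffs is the move from enhanced_field(sensitive=True) dataclasses to pydantic models whose access configs are wrapped in Secret. A minimal sketch of the pattern, assuming pydantic >= 2.7 (where the generic Secret type was introduced); the Demo* names are illustrative, not from the package:

from typing import Optional

from pydantic import BaseModel, Field, Secret


class DemoAccessConfig(BaseModel):  # stand-in for an AccessConfig subclass
    uri: Optional[str] = Field(default=None, description="connection URI")


SecretDemoAccessConfig = Secret[DemoAccessConfig]


class DemoConnectionConfig(BaseModel):  # stand-in for a ConnectionConfig subclass
    access_config: SecretDemoAccessConfig = Field(
        default_factory=lambda: SecretDemoAccessConfig(secret_value=DemoAccessConfig())
    )


config = DemoConnectionConfig(
    access_config=SecretDemoAccessConfig(DemoAccessConfig(uri="mongodb://localhost:27017"))
)
print(config)  # repr masks the wrapped payload instead of leaking the URI
uri = config.access_config.get_secret_value().uri  # unwrap only at the point of use

This is also why MongoDBUploader drops its cached client field: the client is rebuilt from the unwrapped config on demand rather than eagerly in __post_init__.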
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -1,13 +1,12 @@
 import json
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -17,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -35,18 +35,23 @@ CONNECTOR_TYPE = "onedrive"
 MAX_MB_SIZE = 512_000_000
 
 
-@dataclass
 class OnedriveAccessConfig(AccessConfig):
-    client_cred: str
+    client_cred: str = Field(description="Microsoft App client secret")
 
 
-@dataclass
 class OnedriveConnectionConfig(ConnectionConfig):
-    client_id: str
-    user_pname: str
-    tenant: str = field(repr=False)
-    authority_url: Optional[str] = field(repr=False, default="https://login.microsoftonline.com")
-    access_config: OnedriveAccessConfig = enhanced_field(sensitive=True)
+    client_id: str = Field(description="Microsoft app client ID")
+    user_pname: str = Field(description="User principal name, usually is your Azure AD email.")
+    tenant: str = Field(
+        repr=False, description="ID or domain name associated with your Azure AD instance"
+    )
+    authority_url: Optional[str] = Field(
+        repr=False,
+        default="https://login.microsoftonline.com",
+        examples=["https://login.microsoftonline.com"],
+        description="Authentication token provider for Microsoft apps",
+    )
+    access_config: Secret[OnedriveAccessConfig]
 
     @requires_dependencies(["msal"], extras="onedrive")
     def get_token(self):
@@ -56,7 +61,7 @@ class OnedriveConnectionConfig(ConnectionConfig):
             app = ConfidentialClientApplication(
                 authority=f"{self.authority_url}/{self.tenant}",
                 client_id=self.client_id,
-                client_credential=self.access_config.client_cred,
+                client_credential=self.access_config.get_secret_value().client_cred,
             )
             token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
         except ValueError as exc:
@@ -76,9 +81,8 @@ class OnedriveConnectionConfig(ConnectionConfig):
         return client
 
 
-@dataclass
 class OnedriveIndexerConfig(IndexerConfig):
-    path: Optional[str] = field(default="")
+    path: Optional[str] = Field(default="")
     recursive: bool = False
 
 
@@ -87,6 +91,18 @@ class OnedriveIndexer(Indexer):
     connection_config: OnedriveConnectionConfig
     index_config: OnedriveIndexerConfig
 
+    def precheck(self) -> None:
+        try:
+            token_resp: dict = self.connection_config.get_token()
+            if error := token_resp.get("error"):
+                raise SourceConnectionError(
+                    "{} ({})".format(error, token_resp.get("error_description"))
+                )
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_objects(self, folder, recursive) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
@@ -136,7 +152,7 @@ class OnedriveIndexer(Indexer):
             source_identifiers=SourceIdentifiers(
                 fullpath=server_path, filename=drive_item.name, rel_path=rel_path
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=drive_item.parent_reference.path + "/" + drive_item.name,
                 version=drive_item.etag,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -159,7 +175,6 @@ class OnedriveIndexer(Indexer):
             yield file_data
 
 
-@dataclass
 class OnedriveDownloaderConfig(DownloaderConfig):
     pass
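
The precheck() hook added here (also added to the MongoDB uploader above and the Pinecone and Salesforce connectors below) follows one convention: validate credentials eagerly, log with the traceback, and re-raise as a typed connection error. A self-contained sketch of that convention, with illustrative names rather than the package's interfaces:

import logging

logger = logging.getLogger(__name__)


class SourceConnectionError(Exception):
    """Stand-in for unstructured_ingest.error.SourceConnectionError."""


class DemoIndexer:
    def get_client(self):
        # Illustrative: a real connector builds and returns its SDK client here.
        raise RuntimeError("credentials rejected")

    def precheck(self) -> None:
        try:
            self.get_client()
        except Exception as e:
            # Keep the traceback in the logs, surface a typed error to the pipeline.
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}")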
 
unstructured_ingest/v2/processes/connectors/opensearch.py

@@ -1,7 +1,9 @@
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
+from pydantic import BaseModel, Field, Secret
+
 from unstructured_ingest.error import (
     DestinationConnectionError,
 )
@@ -35,20 +37,28 @@ CONNECTOR_TYPE = "opensearch"
 heavily on the Elasticsearch connector code, inheriting the functionality as much as possible."""
 
 
-@dataclass
 class OpenSearchAccessConfig(AccessConfig):
-    password: Optional[str] = enhanced_field(default=None, sensitive=True)
-    use_ssl: bool = False
-    verify_certs: bool = False
-    ssl_show_warn: bool = False
-    ca_certs: Optional[str] = None
-    client_cert: Optional[str] = None
-    client_key: Optional[str] = None
-
-
-@dataclass
-class OpenSearchClientInput(EnhancedDataClassJsonMixin):
-    http_auth: Optional[tuple[str, str]] = enhanced_field(sensitive=True, default=None)
+    password: Optional[str] = Field(default=None, description="password when using basic auth")
+    use_ssl: bool = Field(default=False, description="use ssl for the connection")
+    verify_certs: bool = Field(default=False, description="whether to verify SSL certificates")
+    ssl_show_warn: bool = Field(
+        default=False, description="show warning when verify certs is disabled"
+    )
+    ca_certs: Optional[Path] = Field(default=None, description="path to CA bundle")
+    client_cert: Optional[Path] = Field(
+        default=None,
+        description="path to the file containing the private key and the certificate,"
+        " or cert only if using client_key",
+    )
+    client_key: Optional[Path] = Field(
+        default=None,
+        description="path to the file containing the private key"
+        " if using separate cert and key files",
+    )
+
+
+class OpenSearchClientInput(BaseModel):
+    http_auth: Secret[Optional[tuple[str, str]]] = None
     hosts: Optional[list[str]] = None
     use_ssl: bool = False
     verify_certs: bool = False
@@ -58,37 +68,41 @@ class OpenSearchClientInput(EnhancedDataClassJsonMixin):
     client_key: Optional[str] = None
 
 
-@dataclass
 class OpenSearchConnectionConfig(ConnectionConfig):
-    hosts: Optional[list[str]] = None
-    username: Optional[str] = None
-    access_config: OpenSearchAccessConfig = enhanced_field(sensitive=True)
+    hosts: Optional[list[str]] = Field(
+        default=None,
+        description="List of the OpenSearch hosts to connect",
+        examples=["http://localhost:9200"],
+    )
+    username: Optional[str] = Field(default=None, description="username when using basic auth")
+    access_config: Secret[OpenSearchAccessConfig]
 
     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:
         # https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py
-        client_input = OpenSearchClientInput()
+        access_config = self.access_config.get_secret_value()
+        client_input_kwargs = {}
         if self.hosts:
-            client_input.hosts = self.hosts
-        if self.access_config.use_ssl:
-            client_input.use_ssl = self.access_config.use_ssl
-        if self.access_config.verify_certs:
-            client_input.verify_certs = self.access_config.verify_certs
-        if self.access_config.ssl_show_warn:
-            client_input.ssl_show_warn = self.access_config.ssl_show_warn
-        if self.access_config.ca_certs:
-            client_input.ca_certs = self.access_config.ca_certs
-        if self.access_config.client_cert:
-            client_input.client_cert = self.access_config.client_cert
-        if self.access_config.client_key:
-            client_input.client_key = self.access_config.client_key
-        if self.username and self.access_config.password:
-            client_input.http_auth = (self.username, self.access_config.password)
-        logger.debug(
-            f"OpenSearch client inputs mapped to: {client_input.to_dict(redact_sensitive=True)}"
-        )
-        client_kwargs = client_input.to_dict(redact_sensitive=False)
+            client_input_kwargs["hosts"] = self.hosts
+        if access_config.use_ssl:
+            client_input_kwargs["use_ssl"] = access_config.use_ssl
+        if access_config.verify_certs:
+            client_input_kwargs["verify_certs"] = access_config.verify_certs
+        if access_config.ssl_show_warn:
+            client_input_kwargs["ssl_show_warn"] = access_config.ssl_show_warn
+        if access_config.ca_certs:
+            client_input_kwargs["ca_certs"] = str(access_config.ca_certs)
+        if access_config.client_cert:
+            client_input_kwargs["client_cert"] = str(access_config.client_cert)
+        if access_config.client_key:
+            client_input_kwargs["client_key"] = str(access_config.client_key)
+        if self.username and access_config.password:
+            client_input_kwargs["http_auth"] = (self.username, access_config.password)
+        client_input = OpenSearchClientInput(**client_input_kwargs)
+        logger.debug(f"OpenSearch client inputs mapped to: {client_input.dict()}")
+        client_kwargs = client_input.dict()
+        client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()
         client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None}
         return client_kwargs
 
@@ -100,9 +114,14 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         return OpenSearch(**self.get_client_kwargs())
 
 
+class OpenSearchIndexerConfig(ElasticsearchIndexerConfig):
+    pass
+
+
 @dataclass
 class OpenSearchIndexer(ElasticsearchIndexer):
     connection_config: OpenSearchConnectionConfig
+    index_config: OpenSearchIndexerConfig
     client: "OpenSearch" = field(init=False)
 
     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -112,9 +131,14 @@ class OpenSearchIndexer(ElasticsearchIndexer):
         return scan
 
 
+class OpenSearchDownloaderConfig(ElasticsearchDownloaderConfig):
+    pass
+
+
 @dataclass
 class OpenSearchDownloader(ElasticsearchDownloader):
     connection_config: OpenSearchConnectionConfig
+    download_config: OpenSearchDownloaderConfig
     connector_type: str = CONNECTOR_TYPE
 
     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -125,9 +149,14 @@ class OpenSearchDownloader(ElasticsearchDownloader):
         return AsyncOpenSearch, async_scan
 
 
+class OpenSearchUploaderConfig(ElasticsearchUploaderConfig):
+    pass
+
+
 @dataclass
 class OpenSearchUploader(ElasticsearchUploader):
     connection_config: OpenSearchConnectionConfig
+    upload_config: OpenSearchUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -137,19 +166,28 @@ class OpenSearchUploader(ElasticsearchUploader):
         return parallel_bulk
 
 
+class OpenSearchUploadStagerConfig(ElasticsearchUploadStagerConfig):
+    pass
+
+
+@dataclass
+class OpenSearchUploadStager(ElasticsearchUploadStager):
+    upload_stager_config: OpenSearchUploadStagerConfig
+
+
 opensearch_source_entry = SourceRegistryEntry(
     connection_config=OpenSearchConnectionConfig,
     indexer=OpenSearchIndexer,
-    indexer_config=ElasticsearchIndexerConfig,
+    indexer_config=OpenSearchIndexerConfig,
     downloader=OpenSearchDownloader,
-    downloader_config=ElasticsearchDownloaderConfig,
+    downloader_config=OpenSearchDownloaderConfig,
 )
 
 
 opensearch_destination_entry = DestinationRegistryEntry(
     connection_config=OpenSearchConnectionConfig,
-    upload_stager_config=ElasticsearchUploadStagerConfig,
-    upload_stager=ElasticsearchUploadStager,
-    uploader_config=ElasticsearchUploaderConfig,
+    upload_stager_config=OpenSearchUploadStagerConfig,
+    upload_stager=OpenSearchUploadStager,
+    uploader_config=OpenSearchUploaderConfig,
     uploader=OpenSearchUploader,
 )
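
The rewritten get_client_kwargs keeps http_auth wrapped in Secret inside a plain BaseModel and unwraps it only when handing the kwargs to the SDK. A minimal sketch of that mechanic, again assuming pydantic >= 2.7; ClientInput here is an illustrative stand-in for OpenSearchClientInput:

from typing import Optional

from pydantic import BaseModel, Secret


class ClientInput(BaseModel):  # stand-in for OpenSearchClientInput
    http_auth: Secret[Optional[tuple[str, str]]] = None
    hosts: Optional[list[str]] = None


ci = ClientInput(http_auth=("admin", "admin"), hosts=["http://localhost:9200"])
kwargs = ci.model_dump()  # equivalent to the .dict() call above; http_auth is still wrapped
kwargs["http_auth"] = ci.http_auth.get_secret_value()  # unwrap at the last moment
kwargs = {k: v for k, v in kwargs.items() if v is not None}
# kwargs can now be passed to the OpenSearch client constructor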
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -5,12 +5,11 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
-from unstructured.staging.base import flatten_dict
-from unstructured.utils import requires_dependencies
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -32,25 +31,31 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "pinecone"
 
 
-@dataclass
 class PineconeAccessConfig(AccessConfig):
-    api_key: Optional[str] = enhanced_field(default=None, overload_name="pinecone_api_key")
+    pinecone_api_key: Optional[str] = Field(
+        default=None, description="API key for Pinecone.", alias="api_key"
+    )
+
+
+SecretPineconeAccessConfig = Secret[PineconeAccessConfig]
 
 
-@dataclass
 class PineconeConnectionConfig(ConnectionConfig):
-    index_name: str
-    environment: str
-    access_config: PineconeAccessConfig = enhanced_field(sensitive=True)
+    index_name: str = Field(description="Name of the index to connect to.")
+    environment: str = Field(description="Environment to connect to.")
+    access_config: SecretPineconeAccessConfig = Field(
+        default_factory=lambda: SecretPineconeAccessConfig(secret_value=PineconeAccessConfig())
    )
 
     @requires_dependencies(["pinecone"], extras="pinecone")
     def get_index(self) -> "PineconeIndex":
         from pinecone import Pinecone
-        from unstructured import __version__ as unstructured_version
+
+        from unstructured_ingest import __version__ as unstructured_version
 
         pc = Pinecone(
-            api_key=self.access_config.api_key,
-            source_tag=f"unstructured=={unstructured_version}",
+            api_key=self.access_config.get_secret_value().pinecone_api_key,
+            source_tag=f"unstructured_ingest=={unstructured_version}",
         )
 
         index = pc.Index(self.index_name)
@@ -58,15 +63,13 @@ class PineconeConnectionConfig(ConnectionConfig):
         return index
 
 
-@dataclass
 class PineconeUploadStagerConfig(UploadStagerConfig):
     pass
 
 
-@dataclass
 class PineconeUploaderConfig(UploaderConfig):
-    batch_size: int = 100
-    num_of_processes: int = 4
+    batch_size: int = Field(default=100, description="Number of records per batch")
+    num_processes: int = Field(default=4, description="Number of processes to use for uploading")
 
 
 @dataclass
@@ -123,9 +126,12 @@ class PineconeUploader(Uploader):
     connection_config: PineconeConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @DestinationConnectionError.wrap
-    def check_connection(self):
-        _ = self.connection_config.get_index()
+    def precheck(self):
+        try:
+            self.connection_config.get_index()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batch(self, batch):
@@ -151,18 +157,18 @@ class PineconeUploader(Uploader):
             f" index named {self.connection_config.index_name}"
             f" environment named {self.connection_config.environment}"
             f" with batch size {self.upload_config.batch_size}"
-            f" with {self.upload_config.num_of_processes} (number of) processes"
+            f" with {self.upload_config.num_processes} (number of) processes"
         )
 
         pinecone_batch_size = self.upload_config.batch_size
 
-        if self.upload_config.num_of_processes == 1:
+        if self.upload_config.num_processes == 1:
             for batch in batch_generator(elements_dict, pinecone_batch_size):
                 self.upsert_batch(batch)  # noqa: E203
 
         else:
             with mp.Pool(
-                processes=self.upload_config.num_of_processes,
+                processes=self.upload_config.num_processes,
             ) as pool:
                 pool.map(
                     self.upsert_batch, list(batch_generator(elements_dict, pinecone_batch_size))
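
Note the field rename in PineconeAccessConfig: the attribute becomes pinecone_api_key, but alias="api_key" keeps the old input name working. A quick sketch of how a pydantic alias behaves (illustrative model, not the package's):

from typing import Optional

from pydantic import BaseModel, Field


class DemoAccessConfig(BaseModel):
    pinecone_api_key: Optional[str] = Field(default=None, alias="api_key")


cfg = DemoAccessConfig(api_key="pc-abc123")  # input still uses the old name via the alias
print(cfg.pinecone_api_key)  # -> pc-abc123; code reads the new attribute name

Without populate_by_name=True in the model config, the alias is the only accepted input name, so callers constructing the config from kwargs keep passing api_key.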
unstructured_ingest/v2/processes/connectors/salesforce.py

@@ -15,13 +15,12 @@ from email.utils import formatdate
 from pathlib import Path
 from string import Template
 from textwrap import dedent
-from typing import TYPE_CHECKING, Any, Generator, Type
+from typing import TYPE_CHECKING, Any, Generator, Optional, Type
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -75,49 +75,58 @@ $htmlbody
 )
 
 
-@dataclass
 class SalesforceAccessConfig(AccessConfig):
     consumer_key: str
-    private_key: str
+    private_key_path: Optional[Path] = Field(
+        default=None,
+        description="Path to the private key file. " "Key file is usually named server.key.",
+    )
+    private_key: Optional[str] = Field(default=None, description="Contents of the private key")
+
+    def model_post_init(self, __context: Any) -> None:
+        if self.private_key_path is None and self.private_key is None:
+            raise ValueError("either private_key or private_key_path must be set")
+        if self.private_key is not None and self.private_key_path is not None:
+            raise ValueError("only one of private_key or private_key_path must be set")
 
     @requires_dependencies(["cryptography"])
     def get_private_key_value_and_type(self) -> tuple[str, Type]:
         from cryptography.hazmat.primitives import serialization
 
-        try:
-            serialization.load_pem_private_key(data=self.private_key.encode("utf-8"), password=None)
-        except ValueError:
-            pass
-        else:
+        if self.private_key_path and self.private_key_path.is_file():
+            return str(self.private_key_path), Path
+        if self.private_key:
+            try:
+                serialization.load_pem_private_key(
+                    data=str(self.private_key).encode("utf-8"), password=None
+                )
+            except Exception as e:
+                raise ValueError(f"failed to validate private key data: {e}") from e
             return self.private_key, str
 
-        if Path(self.private_key).is_file():
-            return self.private_key, Path
-
         raise ValueError("private_key does not contain PEM private key or path")
 
 
-@dataclass
 class SalesforceConnectionConfig(ConnectionConfig):
     username: str
-    access_config: SalesforceAccessConfig = enhanced_field(sensitive=True)
+    access_config: Secret[SalesforceAccessConfig]
 
     @requires_dependencies(["simple_salesforce"], extras="salesforce")
     def get_client(self) -> "Salesforce":
         from simple_salesforce import Salesforce
 
-        pkey_value, pkey_type = self.access_config.get_private_key_value_and_type()
+        access_config = self.access_config.get_secret_value()
+        pkey_value, pkey_type = access_config.get_private_key_value_and_type()
 
         return Salesforce(
             username=self.username,
-            consumer_key=self.access_config.consumer_key,
+            consumer_key=access_config.consumer_key,
             privatekey_file=pkey_value if pkey_type is Path else None,
             privatekey=pkey_value if pkey_type is str else None,
             version=SALESFORCE_API_VERSION,
         )
 
 
-@dataclass
 class SalesforceIndexerConfig(IndexerConfig):
     categories: list[str]
 
@@ -132,6 +141,13 @@ class SalesforceIndexer(Indexer):
             if record_type not in ACCEPTED_CATEGORIES:
                 raise ValueError(f"{record_type} not currently an accepted Salesforce category")
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def get_file_extension(self, record_type) -> str:
         if record_type == "EmailMessage":
             extension = ".eml"
@@ -172,7 +188,7 @@ class SalesforceIndexer(Indexer):
                 filename=record_with_extension,
                 fullpath=f"{record['attributes']['type']}/{record_with_extension}",
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=record["attributes"]["url"],
                 version=str(parser.parse(record["SystemModstamp"]).timestamp()),
                 date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
@@ -194,7 +210,6 @@ class SalesforceIndexer(Indexer):
             yield f
 
 
-@dataclass
 class SalesforceDownloaderConfig(DownloaderConfig):
     pass
 
@@ -207,11 +222,6 @@ class SalesforceDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     def _xml_for_record(self, record: OrderedDict) -> str:
         """Creates partitionable xml file from a record"""
         import xml.etree.ElementTree as ET
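
SalesforceAccessConfig now accepts either an in-memory PEM key or a path to a key file, enforcing exactly one of the two in pydantic's model_post_init hook, which runs after field validation. A minimal sketch of the pattern (illustrative class, not the package's):

from pathlib import Path
from typing import Any, Optional

from pydantic import BaseModel


class DemoKeyConfig(BaseModel):  # illustrative stand-in for SalesforceAccessConfig
    private_key_path: Optional[Path] = None
    private_key: Optional[str] = None

    def model_post_init(self, __context: Any) -> None:
        # Cross-field check: exactly one of the two key sources must be provided.
        if self.private_key_path is None and self.private_key is None:
            raise ValueError("either private_key or private_key_path must be set")
        if self.private_key is not None and self.private_key_path is not None:
            raise ValueError("only one of private_key or private_key_path must be set")


DemoKeyConfig(private_key="-----BEGIN PRIVATE KEY-----\n...")  # ok
# DemoKeyConfig() and DemoKeyConfig(private_key=..., private_key_path=...) both raise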