unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +66 -12
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -21
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +17 -8
- unstructured_ingest/v2/interfaces/file_data.py +13 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +73 -7
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +90 -24
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
- unstructured_ingest/v2/processes/connectors/local.py +36 -28
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
- unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
- unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
- unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
- unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
- unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +52 -39
- unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -60
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -6
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
|
@@ -3,9 +3,10 @@ from dataclasses import dataclass, field
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import TYPE_CHECKING, Any, Optional
|
|
5
5
|
|
|
6
|
-
from
|
|
6
|
+
from pydantic import Field, Secret
|
|
7
7
|
|
|
8
|
-
from unstructured_ingest.
|
|
8
|
+
from unstructured_ingest.__version__ import __version__ as unstructured_version
|
|
9
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
9
10
|
from unstructured_ingest.utils.data_prep import batch_generator
|
|
10
11
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
12
|
from unstructured_ingest.v2.interfaces import (
|
|
@@ -30,25 +31,28 @@ CONNECTOR_TYPE = "mongodb"
|
|
|
30
31
|
SERVER_API_VERSION = "1"
|
|
31
32
|
|
|
32
33
|
|
|
33
|
-
@dataclass
|
|
34
34
|
class MongoDBAccessConfig(AccessConfig):
|
|
35
|
-
uri: Optional[str] = None
|
|
35
|
+
uri: Optional[str] = Field(default=None, description="URI to user when connecting")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
SecretMongoDBAccessConfig = Secret[MongoDBAccessConfig]
|
|
36
39
|
|
|
37
40
|
|
|
38
|
-
@dataclass
|
|
39
41
|
class MongoDBConnectionConfig(ConnectionConfig):
|
|
40
|
-
access_config:
|
|
41
|
-
|
|
42
|
+
access_config: SecretMongoDBAccessConfig = Field(
|
|
43
|
+
default_factory=lambda: SecretMongoDBAccessConfig(secret_value=MongoDBAccessConfig())
|
|
42
44
|
)
|
|
43
|
-
host: Optional[str] =
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
45
|
+
host: Optional[str] = Field(
|
|
46
|
+
default=None,
|
|
47
|
+
description="hostname or IP address or Unix domain socket path of a single mongod or "
|
|
48
|
+
"mongos instance to connect to, or a list of hostnames",
|
|
49
|
+
)
|
|
50
|
+
database: Optional[str] = Field(default=None, description="database name to connect to")
|
|
51
|
+
collection: Optional[str] = Field(default=None, description="collection name to connect to")
|
|
52
|
+
port: int = Field(default=27017)
|
|
53
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
49
54
|
|
|
50
55
|
|
|
51
|
-
@dataclass
|
|
52
56
|
class MongoDBUploadStagerConfig(UploadStagerConfig):
|
|
53
57
|
pass
|
|
54
58
|
|
|
@@ -76,20 +80,23 @@ class MongoDBUploadStager(UploadStager):
|
|
|
76
80
|
return output_path
|
|
77
81
|
|
|
78
82
|
|
|
79
|
-
@dataclass
|
|
80
83
|
class MongoDBUploaderConfig(UploaderConfig):
|
|
81
|
-
batch_size: int = 100
|
|
84
|
+
batch_size: int = Field(default=100, description="Number of records per batch")
|
|
82
85
|
|
|
83
86
|
|
|
84
87
|
@dataclass
|
|
85
88
|
class MongoDBUploader(Uploader):
|
|
86
89
|
upload_config: MongoDBUploaderConfig
|
|
87
90
|
connection_config: MongoDBConnectionConfig
|
|
88
|
-
client: Optional["MongoClient"] = field(init=False)
|
|
89
91
|
connector_type: str = CONNECTOR_TYPE
|
|
90
92
|
|
|
91
|
-
def
|
|
92
|
-
|
|
93
|
+
def precheck(self) -> None:
|
|
94
|
+
try:
|
|
95
|
+
client = self.create_client()
|
|
96
|
+
client.admin.command("ping")
|
|
97
|
+
except Exception as e:
|
|
98
|
+
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
99
|
+
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
93
100
|
|
|
94
101
|
@requires_dependencies(["pymongo"], extras="mongodb")
|
|
95
102
|
def create_client(self) -> "MongoClient":
|
|
@@ -97,9 +104,11 @@ class MongoDBUploader(Uploader):
|
|
|
97
104
|
from pymongo.driver_info import DriverInfo
|
|
98
105
|
from pymongo.server_api import ServerApi
|
|
99
106
|
|
|
100
|
-
|
|
107
|
+
access_config = self.connection_config.access_config.get_secret_value()
|
|
108
|
+
|
|
109
|
+
if access_config.uri:
|
|
101
110
|
return MongoClient(
|
|
102
|
-
|
|
111
|
+
access_config.uri,
|
|
103
112
|
server_api=ServerApi(version=SERVER_API_VERSION),
|
|
104
113
|
driver=DriverInfo(name="unstructured", version=unstructured_version),
|
|
105
114
|
)
|
|
@@ -123,7 +132,8 @@ class MongoDBUploader(Uploader):
|
|
|
123
132
|
f"collection {self.connection_config.collection} "
|
|
124
133
|
f"at {self.connection_config.host}",
|
|
125
134
|
)
|
|
126
|
-
|
|
135
|
+
client = self.create_client()
|
|
136
|
+
db = client[self.connection_config.database]
|
|
127
137
|
collection = db[self.connection_config.collection]
|
|
128
138
|
for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
|
|
129
139
|
collection.insert_many(chunk)
|
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
import json
|
|
2
|
-
from dataclasses import dataclass
|
|
2
|
+
from dataclasses import dataclass
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from time import time
|
|
5
5
|
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
6
6
|
|
|
7
7
|
from dateutil import parser
|
|
8
|
-
from
|
|
8
|
+
from pydantic import Field, Secret
|
|
9
9
|
|
|
10
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
11
10
|
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
12
11
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
12
|
from unstructured_ingest.v2.interfaces import (
|
|
@@ -17,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
17
16
|
DownloaderConfig,
|
|
18
17
|
DownloadResponse,
|
|
19
18
|
FileData,
|
|
19
|
+
FileDataSourceMetadata,
|
|
20
20
|
Indexer,
|
|
21
21
|
IndexerConfig,
|
|
22
22
|
SourceIdentifiers,
|
|
@@ -35,18 +35,23 @@ CONNECTOR_TYPE = "onedrive"
|
|
|
35
35
|
MAX_MB_SIZE = 512_000_000
|
|
36
36
|
|
|
37
37
|
|
|
38
|
-
@dataclass
|
|
39
38
|
class OnedriveAccessConfig(AccessConfig):
|
|
40
|
-
client_cred: str
|
|
39
|
+
client_cred: str = Field(description="Microsoft App client secret")
|
|
41
40
|
|
|
42
41
|
|
|
43
|
-
@dataclass
|
|
44
42
|
class OnedriveConnectionConfig(ConnectionConfig):
|
|
45
|
-
client_id: str
|
|
46
|
-
user_pname: str
|
|
47
|
-
tenant: str =
|
|
48
|
-
|
|
49
|
-
|
|
43
|
+
client_id: str = Field(description="Microsoft app client ID")
|
|
44
|
+
user_pname: str = Field(description="User principal name, usually is your Azure AD email.")
|
|
45
|
+
tenant: str = Field(
|
|
46
|
+
repr=False, description="ID or domain name associated with your Azure AD instance"
|
|
47
|
+
)
|
|
48
|
+
authority_url: Optional[str] = Field(
|
|
49
|
+
repr=False,
|
|
50
|
+
default="https://login.microsoftonline.com",
|
|
51
|
+
examples=["https://login.microsoftonline.com"],
|
|
52
|
+
description="Authentication token provider for Microsoft apps",
|
|
53
|
+
)
|
|
54
|
+
access_config: Secret[OnedriveAccessConfig]
|
|
50
55
|
|
|
51
56
|
@requires_dependencies(["msal"], extras="onedrive")
|
|
52
57
|
def get_token(self):
|
|
@@ -56,7 +61,7 @@ class OnedriveConnectionConfig(ConnectionConfig):
|
|
|
56
61
|
app = ConfidentialClientApplication(
|
|
57
62
|
authority=f"{self.authority_url}/{self.tenant}",
|
|
58
63
|
client_id=self.client_id,
|
|
59
|
-
client_credential=self.access_config.client_cred,
|
|
64
|
+
client_credential=self.access_config.get_secret_value().client_cred,
|
|
60
65
|
)
|
|
61
66
|
token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
|
|
62
67
|
except ValueError as exc:
|
|
@@ -76,9 +81,8 @@ class OnedriveConnectionConfig(ConnectionConfig):
|
|
|
76
81
|
return client
|
|
77
82
|
|
|
78
83
|
|
|
79
|
-
@dataclass
|
|
80
84
|
class OnedriveIndexerConfig(IndexerConfig):
|
|
81
|
-
path: Optional[str] =
|
|
85
|
+
path: Optional[str] = Field(default="")
|
|
82
86
|
recursive: bool = False
|
|
83
87
|
|
|
84
88
|
|
|
@@ -87,6 +91,18 @@ class OnedriveIndexer(Indexer):
|
|
|
87
91
|
connection_config: OnedriveConnectionConfig
|
|
88
92
|
index_config: OnedriveIndexerConfig
|
|
89
93
|
|
|
94
|
+
def precheck(self) -> None:
|
|
95
|
+
try:
|
|
96
|
+
token_resp: dict = self.connection_config.get_token()
|
|
97
|
+
if error := token_resp.get("error"):
|
|
98
|
+
raise SourceConnectionError(
|
|
99
|
+
"{} ({})".format(error, token_resp.get("error_description"))
|
|
100
|
+
)
|
|
101
|
+
self.connection_config.get_client()
|
|
102
|
+
except Exception as e:
|
|
103
|
+
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
104
|
+
raise SourceConnectionError(f"failed to validate connection: {e}")
|
|
105
|
+
|
|
90
106
|
def list_objects(self, folder, recursive) -> list["DriveItem"]:
|
|
91
107
|
drive_items = folder.children.get().execute_query()
|
|
92
108
|
files = [d for d in drive_items if d.is_file]
|
|
@@ -136,7 +152,7 @@ class OnedriveIndexer(Indexer):
|
|
|
136
152
|
source_identifiers=SourceIdentifiers(
|
|
137
153
|
fullpath=server_path, filename=drive_item.name, rel_path=rel_path
|
|
138
154
|
),
|
|
139
|
-
metadata=
|
|
155
|
+
metadata=FileDataSourceMetadata(
|
|
140
156
|
url=drive_item.parent_reference.path + "/" + drive_item.name,
|
|
141
157
|
version=drive_item.etag,
|
|
142
158
|
date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
|
|
@@ -159,7 +175,6 @@ class OnedriveIndexer(Indexer):
|
|
|
159
175
|
yield file_data
|
|
160
176
|
|
|
161
177
|
|
|
162
|
-
@dataclass
|
|
163
178
|
class OnedriveDownloaderConfig(DownloaderConfig):
|
|
164
179
|
pass
|
|
165
180
|
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
from dataclasses import dataclass, field
|
|
2
|
+
from pathlib import Path
|
|
2
3
|
from typing import TYPE_CHECKING, Optional
|
|
3
4
|
|
|
4
|
-
from
|
|
5
|
+
from pydantic import BaseModel, Field, Secret
|
|
6
|
+
|
|
5
7
|
from unstructured_ingest.error import (
|
|
6
8
|
DestinationConnectionError,
|
|
7
9
|
)
|
|
@@ -35,20 +37,28 @@ CONNECTOR_TYPE = "opensearch"
|
|
|
35
37
|
heavily on the Elasticsearch connector code, inheriting the functionality as much as possible."""
|
|
36
38
|
|
|
37
39
|
|
|
38
|
-
@dataclass
|
|
39
40
|
class OpenSearchAccessConfig(AccessConfig):
|
|
40
|
-
password: Optional[str] =
|
|
41
|
-
use_ssl: bool = False
|
|
42
|
-
verify_certs: bool = False
|
|
43
|
-
ssl_show_warn: bool =
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
41
|
+
password: Optional[str] = Field(default=None, description="password when using basic auth")
|
|
42
|
+
use_ssl: bool = Field(default=False, description="use ssl for the connection")
|
|
43
|
+
verify_certs: bool = Field(default=False, description="whether to verify SSL certificates")
|
|
44
|
+
ssl_show_warn: bool = Field(
|
|
45
|
+
default=False, description="show warning when verify certs is disabled"
|
|
46
|
+
)
|
|
47
|
+
ca_certs: Optional[Path] = Field(default=None, description="path to CA bundle")
|
|
48
|
+
client_cert: Optional[Path] = Field(
|
|
49
|
+
default=None,
|
|
50
|
+
description="path to the file containing the private key and the certificate,"
|
|
51
|
+
" or cert only if using client_key",
|
|
52
|
+
)
|
|
53
|
+
client_key: Optional[Path] = Field(
|
|
54
|
+
default=None,
|
|
55
|
+
description="path to the file containing the private key"
|
|
56
|
+
" if using separate cert and key files",
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class OpenSearchClientInput(BaseModel):
|
|
61
|
+
http_auth: Secret[Optional[tuple[str, str]]] = None
|
|
52
62
|
hosts: Optional[list[str]] = None
|
|
53
63
|
use_ssl: bool = False
|
|
54
64
|
verify_certs: bool = False
|
|
@@ -58,37 +68,41 @@ class OpenSearchClientInput(EnhancedDataClassJsonMixin):
|
|
|
58
68
|
client_key: Optional[str] = None
|
|
59
69
|
|
|
60
70
|
|
|
61
|
-
@dataclass
|
|
62
71
|
class OpenSearchConnectionConfig(ConnectionConfig):
|
|
63
|
-
hosts: Optional[list[str]] =
|
|
64
|
-
|
|
65
|
-
|
|
72
|
+
hosts: Optional[list[str]] = Field(
|
|
73
|
+
default=None,
|
|
74
|
+
description="List of the OpenSearch hosts to connect",
|
|
75
|
+
examples=["http://localhost:9200"],
|
|
76
|
+
)
|
|
77
|
+
username: Optional[str] = Field(default=None, description="username when using basic auth")
|
|
78
|
+
access_config: Secret[OpenSearchAccessConfig]
|
|
66
79
|
|
|
67
80
|
def get_client_kwargs(self) -> dict:
|
|
68
81
|
# Update auth related fields to conform to what the SDK expects based on the
|
|
69
82
|
# supported methods:
|
|
70
83
|
# https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py
|
|
71
|
-
|
|
84
|
+
access_config = self.access_config.get_secret_value()
|
|
85
|
+
client_input_kwargs = {}
|
|
72
86
|
if self.hosts:
|
|
73
|
-
|
|
74
|
-
if
|
|
75
|
-
|
|
76
|
-
if
|
|
77
|
-
|
|
78
|
-
if
|
|
79
|
-
|
|
80
|
-
if
|
|
81
|
-
|
|
82
|
-
if
|
|
83
|
-
|
|
84
|
-
if
|
|
85
|
-
|
|
86
|
-
if self.username and
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
)
|
|
91
|
-
client_kwargs = client_input.
|
|
87
|
+
client_input_kwargs["hosts"] = self.hosts
|
|
88
|
+
if access_config.use_ssl:
|
|
89
|
+
client_input_kwargs["use_ssl"] = access_config.use_ssl
|
|
90
|
+
if access_config.verify_certs:
|
|
91
|
+
client_input_kwargs["verify_certs"] = access_config.verify_certs
|
|
92
|
+
if access_config.ssl_show_warn:
|
|
93
|
+
client_input_kwargs["ssl_show_warn"] = access_config.ssl_show_warn
|
|
94
|
+
if access_config.ca_certs:
|
|
95
|
+
client_input_kwargs["ca_certs"] = str(access_config.ca_certs)
|
|
96
|
+
if access_config.client_cert:
|
|
97
|
+
client_input_kwargs["client_cert"] = str(access_config.client_cert)
|
|
98
|
+
if access_config.client_key:
|
|
99
|
+
client_input_kwargs["client_key"] = str(access_config.client_key)
|
|
100
|
+
if self.username and access_config.password:
|
|
101
|
+
client_input_kwargs["http_auth"] = (self.username, access_config.password)
|
|
102
|
+
client_input = OpenSearchClientInput(**client_input_kwargs)
|
|
103
|
+
logger.debug(f"OpenSearch client inputs mapped to: {client_input.dict()}")
|
|
104
|
+
client_kwargs = client_input.dict()
|
|
105
|
+
client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()
|
|
92
106
|
client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None}
|
|
93
107
|
return client_kwargs
|
|
94
108
|
|
|
@@ -100,9 +114,14 @@ class OpenSearchConnectionConfig(ConnectionConfig):
|
|
|
100
114
|
return OpenSearch(**self.get_client_kwargs())
|
|
101
115
|
|
|
102
116
|
|
|
117
|
+
class OpenSearchIndexerConfig(ElasticsearchIndexerConfig):
|
|
118
|
+
pass
|
|
119
|
+
|
|
120
|
+
|
|
103
121
|
@dataclass
|
|
104
122
|
class OpenSearchIndexer(ElasticsearchIndexer):
|
|
105
123
|
connection_config: OpenSearchConnectionConfig
|
|
124
|
+
index_config: OpenSearchIndexerConfig
|
|
106
125
|
client: "OpenSearch" = field(init=False)
|
|
107
126
|
|
|
108
127
|
@requires_dependencies(["opensearchpy"], extras="opensearch")
|
|
@@ -112,9 +131,14 @@ class OpenSearchIndexer(ElasticsearchIndexer):
|
|
|
112
131
|
return scan
|
|
113
132
|
|
|
114
133
|
|
|
134
|
+
class OpenSearchDownloaderConfig(ElasticsearchDownloaderConfig):
|
|
135
|
+
pass
|
|
136
|
+
|
|
137
|
+
|
|
115
138
|
@dataclass
|
|
116
139
|
class OpenSearchDownloader(ElasticsearchDownloader):
|
|
117
140
|
connection_config: OpenSearchConnectionConfig
|
|
141
|
+
download_config: OpenSearchDownloaderConfig
|
|
118
142
|
connector_type: str = CONNECTOR_TYPE
|
|
119
143
|
|
|
120
144
|
@requires_dependencies(["opensearchpy"], extras="opensearch")
|
|
@@ -125,9 +149,14 @@ class OpenSearchDownloader(ElasticsearchDownloader):
|
|
|
125
149
|
return AsyncOpenSearch, async_scan
|
|
126
150
|
|
|
127
151
|
|
|
152
|
+
class OpenSearchUploaderConfig(ElasticsearchUploaderConfig):
|
|
153
|
+
pass
|
|
154
|
+
|
|
155
|
+
|
|
128
156
|
@dataclass
|
|
129
157
|
class OpenSearchUploader(ElasticsearchUploader):
|
|
130
158
|
connection_config: OpenSearchConnectionConfig
|
|
159
|
+
upload_config: OpenSearchUploaderConfig
|
|
131
160
|
connector_type: str = CONNECTOR_TYPE
|
|
132
161
|
|
|
133
162
|
@requires_dependencies(["opensearchpy"], extras="opensearch")
|
|
@@ -137,19 +166,28 @@ class OpenSearchUploader(ElasticsearchUploader):
|
|
|
137
166
|
return parallel_bulk
|
|
138
167
|
|
|
139
168
|
|
|
169
|
+
class OpenSearchUploadStagerConfig(ElasticsearchUploadStagerConfig):
|
|
170
|
+
pass
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@dataclass
|
|
174
|
+
class OpenSearchUploadStager(ElasticsearchUploadStager):
|
|
175
|
+
upload_stager_config: OpenSearchUploadStagerConfig
|
|
176
|
+
|
|
177
|
+
|
|
140
178
|
opensearch_source_entry = SourceRegistryEntry(
|
|
141
179
|
connection_config=OpenSearchConnectionConfig,
|
|
142
180
|
indexer=OpenSearchIndexer,
|
|
143
|
-
indexer_config=
|
|
181
|
+
indexer_config=OpenSearchIndexerConfig,
|
|
144
182
|
downloader=OpenSearchDownloader,
|
|
145
|
-
downloader_config=
|
|
183
|
+
downloader_config=OpenSearchDownloaderConfig,
|
|
146
184
|
)
|
|
147
185
|
|
|
148
186
|
|
|
149
187
|
opensearch_destination_entry = DestinationRegistryEntry(
|
|
150
188
|
connection_config=OpenSearchConnectionConfig,
|
|
151
|
-
upload_stager_config=
|
|
152
|
-
upload_stager=
|
|
153
|
-
uploader_config=
|
|
189
|
+
upload_stager_config=OpenSearchUploadStagerConfig,
|
|
190
|
+
upload_stager=OpenSearchUploadStager,
|
|
191
|
+
uploader_config=OpenSearchUploaderConfig,
|
|
154
192
|
uploader=OpenSearchUploader,
|
|
155
193
|
)
|
|
@@ -5,12 +5,11 @@ from dataclasses import dataclass, field
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import TYPE_CHECKING, Any, Optional
|
|
7
7
|
|
|
8
|
-
from
|
|
9
|
-
from unstructured.utils import requires_dependencies
|
|
8
|
+
from pydantic import Field, Secret
|
|
10
9
|
|
|
11
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
12
10
|
from unstructured_ingest.error import DestinationConnectionError
|
|
13
|
-
from unstructured_ingest.utils.data_prep import batch_generator
|
|
11
|
+
from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
|
|
12
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
13
|
from unstructured_ingest.v2.interfaces import (
|
|
15
14
|
AccessConfig,
|
|
16
15
|
ConnectionConfig,
|
|
@@ -32,25 +31,31 @@ if TYPE_CHECKING:
|
|
|
32
31
|
CONNECTOR_TYPE = "pinecone"
|
|
33
32
|
|
|
34
33
|
|
|
35
|
-
@dataclass
|
|
36
34
|
class PineconeAccessConfig(AccessConfig):
|
|
37
|
-
|
|
35
|
+
pinecone_api_key: Optional[str] = Field(
|
|
36
|
+
default=None, description="API key for Pinecone.", alias="api_key"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
SecretPineconeAccessConfig = Secret[PineconeAccessConfig]
|
|
38
41
|
|
|
39
42
|
|
|
40
|
-
@dataclass
|
|
41
43
|
class PineconeConnectionConfig(ConnectionConfig):
|
|
42
|
-
index_name: str
|
|
43
|
-
environment: str
|
|
44
|
-
access_config:
|
|
44
|
+
index_name: str = Field(description="Name of the index to connect to.")
|
|
45
|
+
environment: str = Field(description="Environment to connect to.")
|
|
46
|
+
access_config: SecretPineconeAccessConfig = Field(
|
|
47
|
+
default_factory=lambda: SecretPineconeAccessConfig(secret_value=PineconeAccessConfig())
|
|
48
|
+
)
|
|
45
49
|
|
|
46
50
|
@requires_dependencies(["pinecone"], extras="pinecone")
|
|
47
51
|
def get_index(self) -> "PineconeIndex":
|
|
48
52
|
from pinecone import Pinecone
|
|
49
|
-
|
|
53
|
+
|
|
54
|
+
from unstructured_ingest import __version__ as unstructured_version
|
|
50
55
|
|
|
51
56
|
pc = Pinecone(
|
|
52
|
-
api_key=self.access_config.
|
|
53
|
-
source_tag=f"
|
|
57
|
+
api_key=self.access_config.get_secret_value().pinecone_api_key,
|
|
58
|
+
source_tag=f"unstructured_ingest=={unstructured_version}",
|
|
54
59
|
)
|
|
55
60
|
|
|
56
61
|
index = pc.Index(self.index_name)
|
|
@@ -58,15 +63,13 @@ class PineconeConnectionConfig(ConnectionConfig):
|
|
|
58
63
|
return index
|
|
59
64
|
|
|
60
65
|
|
|
61
|
-
@dataclass
|
|
62
66
|
class PineconeUploadStagerConfig(UploadStagerConfig):
|
|
63
67
|
pass
|
|
64
68
|
|
|
65
69
|
|
|
66
|
-
@dataclass
|
|
67
70
|
class PineconeUploaderConfig(UploaderConfig):
|
|
68
|
-
batch_size: int = 100
|
|
69
|
-
|
|
71
|
+
batch_size: int = Field(default=100, description="Number of records per batch")
|
|
72
|
+
num_processes: int = Field(default=4, description="Number of processes to use for uploading")
|
|
70
73
|
|
|
71
74
|
|
|
72
75
|
@dataclass
|
|
@@ -123,9 +126,12 @@ class PineconeUploader(Uploader):
|
|
|
123
126
|
connection_config: PineconeConnectionConfig
|
|
124
127
|
connector_type: str = CONNECTOR_TYPE
|
|
125
128
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
+
def precheck(self):
|
|
130
|
+
try:
|
|
131
|
+
self.connection_config.get_index()
|
|
132
|
+
except Exception as e:
|
|
133
|
+
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
134
|
+
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
129
135
|
|
|
130
136
|
@requires_dependencies(["pinecone"], extras="pinecone")
|
|
131
137
|
def upsert_batch(self, batch):
|
|
@@ -151,18 +157,18 @@ class PineconeUploader(Uploader):
|
|
|
151
157
|
f" index named {self.connection_config.index_name}"
|
|
152
158
|
f" environment named {self.connection_config.environment}"
|
|
153
159
|
f" with batch size {self.upload_config.batch_size}"
|
|
154
|
-
f" with {self.upload_config.
|
|
160
|
+
f" with {self.upload_config.num_processes} (number of) processes"
|
|
155
161
|
)
|
|
156
162
|
|
|
157
163
|
pinecone_batch_size = self.upload_config.batch_size
|
|
158
164
|
|
|
159
|
-
if self.upload_config.
|
|
165
|
+
if self.upload_config.num_processes == 1:
|
|
160
166
|
for batch in batch_generator(elements_dict, pinecone_batch_size):
|
|
161
167
|
self.upsert_batch(batch) # noqa: E203
|
|
162
168
|
|
|
163
169
|
else:
|
|
164
170
|
with mp.Pool(
|
|
165
|
-
processes=self.upload_config.
|
|
171
|
+
processes=self.upload_config.num_processes,
|
|
166
172
|
) as pool:
|
|
167
173
|
pool.map(
|
|
168
174
|
self.upsert_batch, list(batch_generator(elements_dict, pinecone_batch_size))
|
|
@@ -15,13 +15,12 @@ from email.utils import formatdate
|
|
|
15
15
|
from pathlib import Path
|
|
16
16
|
from string import Template
|
|
17
17
|
from textwrap import dedent
|
|
18
|
-
from typing import TYPE_CHECKING, Any, Generator, Type
|
|
18
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional, Type
|
|
19
19
|
|
|
20
20
|
from dateutil import parser
|
|
21
|
-
from
|
|
21
|
+
from pydantic import Field, Secret
|
|
22
22
|
|
|
23
|
-
from unstructured_ingest.
|
|
24
|
-
from unstructured_ingest.error import SourceConnectionNetworkError
|
|
23
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
25
24
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
26
25
|
from unstructured_ingest.v2.interfaces import (
|
|
27
26
|
AccessConfig,
|
|
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
30
29
|
DownloaderConfig,
|
|
31
30
|
DownloadResponse,
|
|
32
31
|
FileData,
|
|
32
|
+
FileDataSourceMetadata,
|
|
33
33
|
Indexer,
|
|
34
34
|
IndexerConfig,
|
|
35
35
|
SourceIdentifiers,
|
|
@@ -75,49 +75,58 @@ $htmlbody
|
|
|
75
75
|
)
|
|
76
76
|
|
|
77
77
|
|
|
78
|
-
@dataclass
|
|
79
78
|
class SalesforceAccessConfig(AccessConfig):
|
|
80
79
|
consumer_key: str
|
|
81
|
-
|
|
80
|
+
private_key_path: Optional[Path] = Field(
|
|
81
|
+
default=None,
|
|
82
|
+
description="Path to the private key file. " "Key file is usually named server.key.",
|
|
83
|
+
)
|
|
84
|
+
private_key: Optional[str] = Field(default=None, description="Contents of the private key")
|
|
85
|
+
|
|
86
|
+
def model_post_init(self, __context: Any) -> None:
|
|
87
|
+
if self.private_key_path is None and self.private_key is None:
|
|
88
|
+
raise ValueError("either private_key or private_key_path must be set")
|
|
89
|
+
if self.private_key is not None and self.private_key_path is not None:
|
|
90
|
+
raise ValueError("only one of private_key or private_key_path must be set")
|
|
82
91
|
|
|
83
92
|
@requires_dependencies(["cryptography"])
|
|
84
93
|
def get_private_key_value_and_type(self) -> tuple[str, Type]:
|
|
85
94
|
from cryptography.hazmat.primitives import serialization
|
|
86
95
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
96
|
+
if self.private_key_path and self.private_key_path.is_file():
|
|
97
|
+
return str(self.private_key_path), Path
|
|
98
|
+
if self.private_key:
|
|
99
|
+
try:
|
|
100
|
+
serialization.load_pem_private_key(
|
|
101
|
+
data=str(self.private_key).encode("utf-8"), password=None
|
|
102
|
+
)
|
|
103
|
+
except Exception as e:
|
|
104
|
+
raise ValueError(f"failed to validate private key data: {e}") from e
|
|
92
105
|
return self.private_key, str
|
|
93
106
|
|
|
94
|
-
if Path(self.private_key).is_file():
|
|
95
|
-
return self.private_key, Path
|
|
96
|
-
|
|
97
107
|
raise ValueError("private_key does not contain PEM private key or path")
|
|
98
108
|
|
|
99
109
|
|
|
100
|
-
@dataclass
|
|
101
110
|
class SalesforceConnectionConfig(ConnectionConfig):
|
|
102
111
|
username: str
|
|
103
|
-
access_config: SalesforceAccessConfig
|
|
112
|
+
access_config: Secret[SalesforceAccessConfig]
|
|
104
113
|
|
|
105
114
|
@requires_dependencies(["simple_salesforce"], extras="salesforce")
|
|
106
115
|
def get_client(self) -> "Salesforce":
|
|
107
116
|
from simple_salesforce import Salesforce
|
|
108
117
|
|
|
109
|
-
|
|
118
|
+
access_config = self.access_config.get_secret_value()
|
|
119
|
+
pkey_value, pkey_type = access_config.get_private_key_value_and_type()
|
|
110
120
|
|
|
111
121
|
return Salesforce(
|
|
112
122
|
username=self.username,
|
|
113
|
-
consumer_key=
|
|
123
|
+
consumer_key=access_config.consumer_key,
|
|
114
124
|
privatekey_file=pkey_value if pkey_type is Path else None,
|
|
115
125
|
privatekey=pkey_value if pkey_type is str else None,
|
|
116
126
|
version=SALESFORCE_API_VERSION,
|
|
117
127
|
)
|
|
118
128
|
|
|
119
129
|
|
|
120
|
-
@dataclass
|
|
121
130
|
class SalesforceIndexerConfig(IndexerConfig):
|
|
122
131
|
categories: list[str]
|
|
123
132
|
|
|
@@ -132,6 +141,13 @@ class SalesforceIndexer(Indexer):
|
|
|
132
141
|
if record_type not in ACCEPTED_CATEGORIES:
|
|
133
142
|
raise ValueError(f"{record_type} not currently an accepted Salesforce category")
|
|
134
143
|
|
|
144
|
+
def precheck(self) -> None:
|
|
145
|
+
try:
|
|
146
|
+
self.connection_config.get_client()
|
|
147
|
+
except Exception as e:
|
|
148
|
+
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
149
|
+
raise SourceConnectionError(f"failed to validate connection: {e}")
|
|
150
|
+
|
|
135
151
|
def get_file_extension(self, record_type) -> str:
|
|
136
152
|
if record_type == "EmailMessage":
|
|
137
153
|
extension = ".eml"
|
|
@@ -172,7 +188,7 @@ class SalesforceIndexer(Indexer):
|
|
|
172
188
|
filename=record_with_extension,
|
|
173
189
|
fullpath=f"{record['attributes']['type']}/{record_with_extension}",
|
|
174
190
|
),
|
|
175
|
-
metadata=
|
|
191
|
+
metadata=FileDataSourceMetadata(
|
|
176
192
|
url=record["attributes"]["url"],
|
|
177
193
|
version=str(parser.parse(record["SystemModstamp"]).timestamp()),
|
|
178
194
|
date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
|
|
@@ -194,7 +210,6 @@ class SalesforceIndexer(Indexer):
|
|
|
194
210
|
yield f
|
|
195
211
|
|
|
196
212
|
|
|
197
|
-
@dataclass
|
|
198
213
|
class SalesforceDownloaderConfig(DownloaderConfig):
|
|
199
214
|
pass
|
|
200
215
|
|
|
@@ -207,11 +222,6 @@ class SalesforceDownloader(Downloader):
|
|
|
207
222
|
)
|
|
208
223
|
connector_type: str = CONNECTOR_TYPE
|
|
209
224
|
|
|
210
|
-
def get_download_path(self, file_data: FileData) -> Path:
|
|
211
|
-
rel_path = file_data.source_identifiers.relative_path
|
|
212
|
-
rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
|
|
213
|
-
return self.download_dir / Path(rel_path)
|
|
214
|
-
|
|
215
225
|
def _xml_for_record(self, record: OrderedDict) -> str:
|
|
216
226
|
"""Creates partitionable xml file from a record"""
|
|
217
227
|
import xml.etree.ElementTree as ET
|