unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +57 -13
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +8 -5
- unstructured_ingest/v2/interfaces/file_data.py +8 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/pipeline.py +1 -5
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- unstructured_ingest/v2/processes/connectors/local.py +22 -14
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
- unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +11 -5
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/opensearch.py

@@ -1,7 +1,9 @@
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Optional

-from
+from pydantic import BaseModel, Field, Secret
+
 from unstructured_ingest.error import (
     DestinationConnectionError,
 )
@@ -35,20 +37,28 @@ CONNECTOR_TYPE = "opensearch"
 heavily on the Elasticsearch connector code, inheriting the functionality as much as possible."""


-@dataclass
 class OpenSearchAccessConfig(AccessConfig):
-    password: Optional[str] =
-    use_ssl: bool = False
-    verify_certs: bool = False
-    ssl_show_warn: bool =
-
-
-
-
-
-
-
-
+    password: Optional[str] = Field(default=None, description="password when using basic auth")
+    use_ssl: bool = Field(default=False, description="use ssl for the connection")
+    verify_certs: bool = Field(default=False, description="whether to verify SSL certificates")
+    ssl_show_warn: bool = Field(
+        default=False, description="show warning when verify certs is disabled"
+    )
+    ca_certs: Optional[Path] = Field(default=None, description="path to CA bundle")
+    client_cert: Optional[Path] = Field(
+        default=None,
+        description="path to the file containing the private key and the certificate,"
+        " or cert only if using client_key",
+    )
+    client_key: Optional[Path] = Field(
+        default=None,
+        description="path to the file containing the private key"
+        " if using separate cert and key files",
+    )
+
+
+class OpenSearchClientInput(BaseModel):
+    http_auth: Secret[Optional[tuple[str, str]]] = None
     hosts: Optional[list[str]] = None
     use_ssl: bool = False
     verify_certs: bool = False
@@ -58,37 +68,41 @@ class OpenSearchClientInput(EnhancedDataClassJsonMixin):
     client_key: Optional[str] = None


-@dataclass
 class OpenSearchConnectionConfig(ConnectionConfig):
-    hosts: Optional[list[str]] =
-
-
+    hosts: Optional[list[str]] = Field(
+        default=None,
+        description="List of the OpenSearch hosts to connect",
+        examples=["http://localhost:9200"],
+    )
+    username: Optional[str] = Field(default=None, description="username when using basic auth")
+    access_config: Secret[OpenSearchAccessConfig]

     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:
         # https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py
-
+        access_config = self.access_config.get_secret_value()
+        client_input_kwargs = {}
         if self.hosts:
-
-        if
-
-        if
-
-        if
-
-        if
-
-        if
-
-        if
-
-        if self.username and
-
-
-
-        )
-        client_kwargs = client_input.
+            client_input_kwargs["hosts"] = self.hosts
+        if access_config.use_ssl:
+            client_input_kwargs["use_ssl"] = access_config.use_ssl
+        if access_config.verify_certs:
+            client_input_kwargs["verify_certs"] = access_config.verify_certs
+        if access_config.ssl_show_warn:
+            client_input_kwargs["ssl_show_warn"] = access_config.ssl_show_warn
+        if access_config.ca_certs:
+            client_input_kwargs["ca_certs"] = str(access_config.ca_certs)
+        if access_config.client_cert:
+            client_input_kwargs["client_cert"] = str(access_config.client_cert)
+        if access_config.client_key:
+            client_input_kwargs["client_key"] = str(access_config.client_key)
+        if self.username and access_config.password:
+            client_input_kwargs["http_auth"] = (self.username, access_config.password)
+        client_input = OpenSearchClientInput(**client_input_kwargs)
+        logger.debug(f"OpenSearch client inputs mapped to: {client_input.dict()}")
+        client_kwargs = client_input.dict()
+        client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()
         client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None}
         return client_kwargs

@@ -100,15 +114,14 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         return OpenSearch(**self.get_client_kwargs())


-
-class OpensearchIndexerConfig(ElasticsearchIndexerConfig):
+class OpenSearchIndexerConfig(ElasticsearchIndexerConfig):
     pass


 @dataclass
 class OpenSearchIndexer(ElasticsearchIndexer):
     connection_config: OpenSearchConnectionConfig
-    index_config:
+    index_config: OpenSearchIndexerConfig
     client: "OpenSearch" = field(init=False)

     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -118,15 +131,14 @@ class OpenSearchIndexer(ElasticsearchIndexer):
         return scan


-
-class OpensearchDownloaderConfig(ElasticsearchDownloaderConfig):
+class OpenSearchDownloaderConfig(ElasticsearchDownloaderConfig):
     pass


 @dataclass
 class OpenSearchDownloader(ElasticsearchDownloader):
     connection_config: OpenSearchConnectionConfig
-    download_config:
+    download_config: OpenSearchDownloaderConfig
     connector_type: str = CONNECTOR_TYPE

     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -137,15 +149,14 @@ class OpenSearchDownloader(ElasticsearchDownloader):
         return AsyncOpenSearch, async_scan


-
-class OpensearchUploaderConfig(ElasticsearchUploaderConfig):
+class OpenSearchUploaderConfig(ElasticsearchUploaderConfig):
     pass


 @dataclass
 class OpenSearchUploader(ElasticsearchUploader):
     connection_config: OpenSearchConnectionConfig
-    upload_config:
+    upload_config: OpenSearchUploaderConfig
     connector_type: str = CONNECTOR_TYPE

     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -155,29 +166,28 @@ class OpenSearchUploader(ElasticsearchUploader):
         return parallel_bulk


-
-class OpensearchUploadStagerConfig(ElasticsearchUploadStagerConfig):
+class OpenSearchUploadStagerConfig(ElasticsearchUploadStagerConfig):
     pass


 @dataclass
-class
-    upload_stager_config:
+class OpenSearchUploadStager(ElasticsearchUploadStager):
+    upload_stager_config: OpenSearchUploadStagerConfig


 opensearch_source_entry = SourceRegistryEntry(
     connection_config=OpenSearchConnectionConfig,
     indexer=OpenSearchIndexer,
-    indexer_config=
+    indexer_config=OpenSearchIndexerConfig,
     downloader=OpenSearchDownloader,
-    downloader_config=
+    downloader_config=OpenSearchDownloaderConfig,
 )


 opensearch_destination_entry = DestinationRegistryEntry(
     connection_config=OpenSearchConnectionConfig,
-    upload_stager_config=
-    upload_stager=
-    uploader_config=
+    upload_stager_config=OpenSearchUploadStagerConfig,
+    upload_stager=OpenSearchUploadStager,
+    uploader_config=OpenSearchUploaderConfig,
     uploader=OpenSearchUploader,
 )
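The hunks above show this release's central migration: dataclass-based connector configs become pydantic models, per-field metadata moves into pydantic Field(...), and the credential-carrying access config is wrapped in pydantic's Secret so it stays masked until explicitly unwrapped. A minimal usage sketch of the new OpenSearch config, assuming the 0.0.4 classes above (host, username, and password values are placeholders, not taken from the package):

# Sketch only: constructing the pydantic-based 0.0.4 config; values are placeholders.
from unstructured_ingest.v2.processes.connectors.opensearch import (
    OpenSearchAccessConfig,
    OpenSearchConnectionConfig,
)

config = OpenSearchConnectionConfig(
    hosts=["http://localhost:9200"],
    username="admin",
    # pydantic validates the plain model into Secret[OpenSearchAccessConfig]
    access_config=OpenSearchAccessConfig(password="admin", use_ssl=True),
)

print(config)  # the Secret-wrapped access config renders masked, not as plaintext
client_kwargs = config.get_client_kwargs()  # unwraps via get_secret_value() internally

Only code that calls get_secret_value(), as get_client_kwargs() does above, ever sees the raw password; reprs and logs see the masked wrapper.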
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -5,12 +5,11 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional

-from
-from unstructured.utils import requires_dependencies
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -32,25 +31,31 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "pinecone"


-@dataclass
 class PineconeAccessConfig(AccessConfig):
-
+    pinecone_api_key: Optional[str] = Field(
+        default=None, description="API key for Pinecone.", alias="api_key"
+    )
+
+
+SecretPineconeAccessConfig = Secret[PineconeAccessConfig]


-@dataclass
 class PineconeConnectionConfig(ConnectionConfig):
-    index_name: str
-    environment: str
-    access_config:
+    index_name: str = Field(description="Name of the index to connect to.")
+    environment: str = Field(description="Environment to connect to.")
+    access_config: SecretPineconeAccessConfig = Field(
+        default_factory=lambda: SecretPineconeAccessConfig(secret_value=PineconeAccessConfig())
+    )

     @requires_dependencies(["pinecone"], extras="pinecone")
     def get_index(self) -> "PineconeIndex":
         from pinecone import Pinecone
-
+
+        from unstructured_ingest import __version__ as unstructured_version

         pc = Pinecone(
-            api_key=self.access_config.
-            source_tag=f"
+            api_key=self.access_config.get_secret_value().pinecone_api_key,
+            source_tag=f"unstructured_ingest=={unstructured_version}",
         )

         index = pc.Index(self.index_name)
@@ -58,15 +63,13 @@ class PineconeConnectionConfig(ConnectionConfig):
         return index


-@dataclass
 class PineconeUploadStagerConfig(UploadStagerConfig):
     pass


-@dataclass
 class PineconeUploaderConfig(UploaderConfig):
-    batch_size: int = 100
-
+    batch_size: int = Field(default=100, description="Number of records per batch")
+    num_processes: int = Field(default=4, description="Number of processes to use for uploading")


 @dataclass
@@ -154,18 +157,18 @@ class PineconeUploader(Uploader):
             f" index named {self.connection_config.index_name}"
             f" environment named {self.connection_config.environment}"
             f" with batch size {self.upload_config.batch_size}"
-            f" with {self.upload_config.
+            f" with {self.upload_config.num_processes} (number of) processes"
         )

         pinecone_batch_size = self.upload_config.batch_size

-        if self.upload_config.
+        if self.upload_config.num_processes == 1:
             for batch in batch_generator(elements_dict, pinecone_batch_size):
                 self.upsert_batch(batch)  # noqa: E203

         else:
             with mp.Pool(
-                processes=self.upload_config.
+                processes=self.upload_config.num_processes,
             ) as pool:
                 pool.map(
                     self.upsert_batch, list(batch_generator(elements_dict, pinecone_batch_size))
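Note the alias in the new PineconeAccessConfig: the field is stored as pinecone_api_key but populated through its "api_key" alias, and SecretPineconeAccessConfig is simply Secret[PineconeAccessConfig]. A sketch under those assumptions (index, environment, and key values are placeholders):

# Sketch only: populating the aliased secret field; values are placeholders.
from unstructured_ingest.v2.processes.connectors.pinecone import (
    PineconeAccessConfig,
    PineconeConnectionConfig,
)

config = PineconeConnectionConfig(
    index_name="example-index",
    environment="us-east-1",
    # passed via the field alias "api_key", stored as pinecone_api_key
    access_config=PineconeAccessConfig(api_key="PINECONE_API_KEY_PLACEHOLDER"),
)

# get_index() (above) unwraps the secret the same way:
api_key = config.access_config.get_secret_value().pinecone_api_key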
unstructured_ingest/v2/processes/connectors/salesforce.py

@@ -15,11 +15,11 @@ from email.utils import formatdate
 from pathlib import Path
 from string import Template
 from textwrap import dedent
-from typing import TYPE_CHECKING, Any, Generator, Type
+from typing import TYPE_CHECKING, Any, Generator, Optional, Type

 from dateutil import parser
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -75,49 +75,58 @@ $htmlbody
 )


-@dataclass
 class SalesforceAccessConfig(AccessConfig):
     consumer_key: str
-
+    private_key_path: Optional[Path] = Field(
+        default=None,
+        description="Path to the private key file. " "Key file is usually named server.key.",
+    )
+    private_key: Optional[str] = Field(default=None, description="Contents of the private key")
+
+    def model_post_init(self, __context: Any) -> None:
+        if self.private_key_path is None and self.private_key is None:
+            raise ValueError("either private_key or private_key_path must be set")
+        if self.private_key is not None and self.private_key_path is not None:
+            raise ValueError("only one of private_key or private_key_path must be set")

     @requires_dependencies(["cryptography"])
     def get_private_key_value_and_type(self) -> tuple[str, Type]:
         from cryptography.hazmat.primitives import serialization

-
-
-
-
-
+        if self.private_key_path and self.private_key_path.is_file():
+            return str(self.private_key_path), Path
+        if self.private_key:
+            try:
+                serialization.load_pem_private_key(
+                    data=str(self.private_key).encode("utf-8"), password=None
+                )
+            except Exception as e:
+                raise ValueError(f"failed to validate private key data: {e}") from e
             return self.private_key, str

-        if Path(self.private_key).is_file():
-            return self.private_key, Path
-
         raise ValueError("private_key does not contain PEM private key or path")


-@dataclass
 class SalesforceConnectionConfig(ConnectionConfig):
     username: str
-    access_config: SalesforceAccessConfig
+    access_config: Secret[SalesforceAccessConfig]

     @requires_dependencies(["simple_salesforce"], extras="salesforce")
     def get_client(self) -> "Salesforce":
         from simple_salesforce import Salesforce

-
+        access_config = self.access_config.get_secret_value()
+        pkey_value, pkey_type = access_config.get_private_key_value_and_type()

         return Salesforce(
             username=self.username,
-            consumer_key=
+            consumer_key=access_config.consumer_key,
             privatekey_file=pkey_value if pkey_type is Path else None,
             privatekey=pkey_value if pkey_type is str else None,
             version=SALESFORCE_API_VERSION,
         )


-@dataclass
 class SalesforceIndexerConfig(IndexerConfig):
     categories: list[str]

@@ -201,7 +210,6 @@ class SalesforceIndexer(Indexer):
             yield f


-@dataclass
 class SalesforceDownloaderConfig(DownloaderConfig):
     pass

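The new model_post_init hook gives SalesforceAccessConfig an exactly-one-of constraint across private_key and private_key_path. A sketch of that behavior, assuming the classes above (consumer key and key path are placeholders; pydantic's ValidationError subclasses ValueError, so the except clause holds whether or not pydantic wraps the error):

# Sketch only: the exactly-one-of constraint enforced in model_post_init.
from pathlib import Path

from unstructured_ingest.v2.processes.connectors.salesforce import SalesforceAccessConfig

# valid: exactly one of private_key / private_key_path is set
ok = SalesforceAccessConfig(
    consumer_key="CONSUMER_KEY_PLACEHOLDER",
    private_key_path=Path("server.key"),
)

try:
    SalesforceAccessConfig(consumer_key="CONSUMER_KEY_PLACEHOLDER")  # neither key set
except ValueError as e:
    print(e)  # "either private_key or private_key_path must be set"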
unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -6,7 +6,8 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import quote

-from
+from pydantic import BaseModel, Field, Secret, SecretStr
+
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -54,24 +55,38 @@ class SharepointContentType(Enum):
     LIST = "list"


-@dataclass
 class SharepointAccessConfig(AccessConfig):
-    client_cred: str
-
-
-
-
-
-
-
-
+    client_cred: str = Field(description="Sharepoint app secret")
+
+
+class SharepointPermissionsConfig(BaseModel):
+    permissions_application_id: str = Field(description="Microsoft Graph API application id")
+    permissions_tenant: str = Field(
+        description="url to get permissions data within tenant.",
+        examples=["https://contoso.onmicrosoft.com"],
+    )
+    permissions_client_cred: SecretStr = Field(
+        description="Microsoft Graph API application credentials"
+    )
+    authority_url: Optional[SecretStr] = Field(
+        repr=False,
+        default_factory=lambda: SecretStr(secret_value="https://login.microsoftonline.com"),
+        description="Permissions authority url",
+        examples=["https://login.microsoftonline.com"],
+    )


-@dataclass
 class SharepointConnectionConfig(ConnectionConfig):
-    client_id: str
-    site: str
-
+    client_id: str = Field(description="Sharepoint app client ID")
+    site: str = Field(
+        description="Sharepoint site url. Process either base url e.g \
+        https://[tenant].sharepoint.com or relative sites \
+        https://[tenant].sharepoint.com/sites/<site_name>. \
+        To process all sites within the tenant pass a site url as \
+        https://[tenant]-admin.sharepoint.com.\
+        This requires the app to be registered at a tenant level"
+    )
+    access_config: Secret[SharepointAccessConfig]
     permissions_config: Optional[SharepointPermissionsConfig] = None

     @requires_dependencies(["office365"], extras="sharepoint")
@@ -80,7 +95,9 @@ class SharepointConnectionConfig(ConnectionConfig):
         from office365.sharepoint.client_context import ClientContext

         try:
-            credentials = ClientCredential(
+            credentials = ClientCredential(
+                self.client_id, self.access_config.get_secret_value().client_cred
+            )
             site_client = ClientContext(self.site).with_credentials(credentials)
         except Exception as e:
             logger.error(f"Couldn't set Sharepoint client: {e}")
@@ -92,11 +109,12 @@ class SharepointConnectionConfig(ConnectionConfig):
         from msal import ConfidentialClientApplication

         try:
+            client_credential = self.permissions_config.permissions_client_cred.get_secret_value()
             app = ConfidentialClientApplication(
-                authority=f"{self.permissions_config.authority_url}/"
+                authority=f"{self.permissions_config.authority_url.get_secret_value()}/"
                 f"{self.permissions_config.permissions_tenant}",
                 client_id=self.permissions_config.permissions_application_id,
-                client_credential=
+                client_credential=client_credential,
             )
             token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
         except ValueError as exc:
@@ -119,13 +137,21 @@ class SharepointConnectionConfig(ConnectionConfig):
         return client


-@dataclass
 class SharepointIndexerConfig(IndexerConfig):
-    path: Optional[str] =
-
-
-
-
+    path: Optional[str] = Field(
+        defaul=None,
+        description="Path from which to start parsing files. If the connector is to \
+        process all sites within the tenant this filter will be applied to \
+        all sites document libraries.",
+    )
+    recursive: bool = Field(
+        default=False,
+        description="Recursively download files in their respective folders "
+        "otherwise stop at the files in provided folder level.",
+    )
+    omit_files: bool = Field(default=False, description="Don't process files.")
+    omit_pages: bool = Field(default=False, description="Don't process site pages.")
+    omit_lists: bool = Field(default=False, description="Don't process lists.")


 @dataclass
@@ -310,7 +336,7 @@ class SharepointIndexer(Indexer):
     def process_permissions(self) -> bool:
         return (
             self.connection_config.permissions_config.permissions_tenant
-            and self.connection_config.permissions_config.permissions_client_cred
+            and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
             and self.connection_config.permissions_config.permissions_application_id
         )

@@ -335,7 +361,6 @@ class SharepointIndexer(Indexer):
             yield file_data


-@dataclass
 class SharepointDownloaderConfig(DownloaderConfig):
     pass

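With permissions_client_cred now a pydantic SecretStr and access_config wrapped in Secret, both credentials stay masked until get_secret_value() is called, as get_permissions_client() above does. A wiring sketch assuming the 0.0.4 classes above (every value is a placeholder):

# Sketch only: wiring the new Sharepoint config classes; values are placeholders.
from unstructured_ingest.v2.processes.connectors.sharepoint import (
    SharepointAccessConfig,
    SharepointConnectionConfig,
    SharepointPermissionsConfig,
)

config = SharepointConnectionConfig(
    client_id="SP_CLIENT_ID_PLACEHOLDER",
    site="https://contoso.sharepoint.com/sites/ExampleSite",
    access_config=SharepointAccessConfig(client_cred="SP_APP_SECRET_PLACEHOLDER"),
    permissions_config=SharepointPermissionsConfig(
        permissions_application_id="GRAPH_APP_ID_PLACEHOLDER",
        permissions_tenant="https://contoso.onmicrosoft.com",
        permissions_client_cred="GRAPH_SECRET_PLACEHOLDER",  # coerced to SecretStr
    ),
)

# masked in repr/logs; the raw value is only available via an explicit unwrap
cred = config.permissions_config.permissions_client_cred.get_secret_value()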
unstructured_ingest/v2/processes/connectors/singlestore.py

@@ -7,8 +7,8 @@ from typing import TYPE_CHECKING, Any, Optional
 import numpy as np
 import pandas as pd
 from dateutil import parser
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.table import convert_to_pandas_dataframe
@@ -33,18 +33,16 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "singlestore"


-@dataclass
 class SingleStoreAccessConfig(AccessConfig):
-    password: Optional[str] = None
+    password: Optional[str] = Field(default=None, description="SingleStore password")


-@dataclass
 class SingleStoreConnectionConfig(ConnectionConfig):
-    host: Optional[str] = None
-    port: Optional[int] = None
-    user: Optional[str] = None
-    database: Optional[str] = None
-    access_config: SingleStoreAccessConfig
+    host: Optional[str] = Field(default=None, description="SingleStore host")
+    port: Optional[int] = Field(default=None, description="SingleStore port")
+    user: Optional[str] = Field(default=None, description="SingleStore user")
+    database: Optional[str] = Field(default=None, description="SingleStore database")
+    access_config: Secret[SingleStoreAccessConfig]

     @requires_dependencies(["singlestoredb"], extras="singlestore")
     def get_connection(self) -> "Connection":
@@ -55,14 +53,13 @@ class SingleStoreConnectionConfig(ConnectionConfig):
             port=self.port,
             database=self.database,
             user=self.user,
-            password=self.access_config.password,
+            password=self.access_config.get_secret_value().password,
         )
         return conn


-@dataclass
 class SingleStoreUploadStagerConfig(UploadStagerConfig):
-    drop_empty_cols: bool = False
+    drop_empty_cols: bool = Field(default=False, description="Drop any columns that have no data")


 @dataclass
@@ -112,10 +109,9 @@ class SingleStoreUploadStager(UploadStager):
         return output_path


-@dataclass
 class SingleStoreUploaderConfig(UploaderConfig):
-    table_name: str
-    batch_size: int = 100
+    table_name: str = Field(description="SingleStore table to write contents to")
+    batch_size: int = Field(default=100, description="Batch size when writing to SingleStore")


 @dataclass
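The SingleStore connector follows the same pattern: plain connection fields stay on the connection config while the password moves behind Secret[SingleStoreAccessConfig], and get_connection() unwraps it only at connect time. A sketch assuming the classes above, the singlestore extra installed, and a reachable server (all values are placeholders):

# Sketch only: Secret-wrapped password on the SingleStore config; values are placeholders.
from unstructured_ingest.v2.processes.connectors.singlestore import (
    SingleStoreAccessConfig,
    SingleStoreConnectionConfig,
)

config = SingleStoreConnectionConfig(
    host="localhost",
    port=3306,
    user="root",
    database="ingest_example",
    access_config=SingleStoreAccessConfig(password="PASSWORD_PLACEHOLDER"),
)

# get_connection() (above) reads self.access_config.get_secret_value().password;
# this call only succeeds against a running SingleStore instance.
conn = config.get_connection()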