unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest has been flagged as potentially problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +57 -13
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +8 -5
- unstructured_ingest/v2/interfaces/file_data.py +8 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/pipeline.py +1 -5
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- unstructured_ingest/v2/processes/connectors/local.py +22 -14
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
- unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +11 -5
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
--- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py (0.0.3)
+++ unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py (0.0.4)
@@ -1,10 +1,11 @@
 import json
-import typing as t
 import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -20,27 +21,31 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
-    add_destination_entry,
 )
 from unstructured_ingest.v2.processes.connectors.utils import parse_datetime

-if t.TYPE_CHECKING:
+if TYPE_CHECKING:
     from azure.search.documents import SearchClient


 CONNECTOR_TYPE = "azure_cognitive_search"


-@dataclass
 class AzureCognitiveSearchAccessConfig(AccessConfig):
-    key: str = enhanced_field(sensitive=True)
+    azure_cognitive_search_key: str = Field(
+        alias="key", description="Credential that is used for authenticating to an Azure service"
+    )


-@dataclass
 class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
-    endpoint: str
-    index: str
-    access_config: AzureCognitiveSearchAccessConfig = enhanced_field(sensitive=True)
+    endpoint: str = Field(
+        description="The URL endpoint of an Azure AI (Cognitive) search service. "
+        "In the form of https://{{service_name}}.search.windows.net"
+    )
+    index: str = Field(
+        description="The name of the Azure AI (Cognitive) Search index to connect to."
+    )
+    access_config: Secret[AzureCognitiveSearchAccessConfig]

     @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
     def generate_client(self) -> "SearchClient":
@@ -50,18 +55,18 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
         return SearchClient(
             endpoint=self.endpoint,
             index_name=self.index,
-            credential=AzureKeyCredential(self.access_config.key),
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_cognitive_search_key
+            ),
         )


-@dataclass
 class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
     pass


-@dataclass
 class AzureCognitiveSearchUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")


 @dataclass
@@ -122,7 +127,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
-        **kwargs: t.Any,
+        **kwargs: Any,
     ) -> Path:
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
@@ -143,7 +148,7 @@ class AzureCognitiveSearchUploader(Uploader):

     @DestinationConnectionError.wrap
     @requires_dependencies(["azure"], extras="azure-cognitive-search")
-    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
+    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
         import azure.core.exceptions

         logger.info(
@@ -169,7 +174,8 @@ class AzureCognitiveSearchUploader(Uploader):
             raise WriteError(
                 ", ".join(
                     [
-                        f"{error.key}: [{error.status_code}] {error.error_message}"
+                        f"{error.azure_cognitive_search_key}: "
+                        f"[{error.status_code}] {error.error_message}"
                         for error in errors
                     ],
                 ),
@@ -186,7 +192,7 @@ class AzureCognitiveSearchUploader(Uploader):
     def write_dict_wrapper(self, elements_dict):
         return self.write_dict(elements_dict=elements_dict)

-    def run(self, contents: list[UploadContent], **kwargs: t.Any) -> None:
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:

         elements_dict = []
         for content in contents:
@@ -207,13 +213,10 @@ class AzureCognitiveSearchUploader(Uploader):
             self.write_dict(elements_dict=chunk)  # noqa: E203


-add_destination_entry(
-    destination_type=CONNECTOR_TYPE,
-    entry=DestinationRegistryEntry(
-        connection_config=AzureCognitiveSearchConnectionConfig,
-        uploader=AzureCognitiveSearchUploader,
-        uploader_config=AzureCognitiveSearchUploaderConfig,
-        upload_stager=AzureCognitiveSearchUploadStager,
-        upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
-    ),
+azure_cognitive_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureCognitiveSearchConnectionConfig,
+    uploader=AzureCognitiveSearchUploader,
+    uploader_config=AzureCognitiveSearchUploaderConfig,
+    upload_stager=AzureCognitiveSearchUploadStager,
+    upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
 )
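Note: the diff above is the template for most connector changes in this release: @dataclass configs using enhanced_field(sensitive=True) become pydantic models, secrets move behind Secret[...], and user-facing option names are preserved with Field(alias=...). A minimal construction sketch, assuming standard pydantic v2 alias handling and that a plain access model is accepted where a Secret[...] field is declared (pydantic semantics, not shown in the diff); all endpoint, index, and key values are placeholders:

    # Sketch only; class and field names come from the diff above.
    config = AzureCognitiveSearchConnectionConfig(
        endpoint="https://my-service.search.windows.net",
        index="my-index",
        # The field is declared as azure_cognitive_search_key with alias="key",
        # so it is populated through the alias:
        access_config=AzureCognitiveSearchAccessConfig(key="<search-api-key>"),
    )
    # Secret keeps the payload masked in repr/serialization until explicitly
    # unwrapped, which is exactly what generate_client() now does:
    key = config.access_config.get_secret_value().azure_cognitive_search_key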
--- unstructured_ingest/v2/processes/connectors/chroma.py (0.0.3)
+++ unstructured_ingest/v2/processes/connectors/chroma.py (0.0.4)
@@ -3,11 +3,11 @@ import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Optional

 from dateutil import parser
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -32,26 +32,35 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "chroma"


-@dataclass
 class ChromaAccessConfig(AccessConfig):
-    settings: Optional[Dict[str, str]] = None
-    headers: Optional[Dict[str, str]] = None
+    settings: Optional[dict[str, str]] = Field(
+        default=None, description="A dictionary of settings to communicate with the chroma server."
+    )
+    headers: Optional[dict[str, str]] = Field(
+        default=None, description="A dictionary of headers to send to the Chroma server."
+    )


-@dataclass
 class ChromaConnectionConfig(ConnectionConfig):
-    collection_name: str
-    access_config: ChromaAccessConfig
-    path: Optional[str] = None
-    tenant: Optional[str] = "default_tenant"
-    database: Optional[str] = "default_database"
-    host: Optional[str] = None
-    port: Optional[int] = None
-    ssl: bool = False
-    connector_type: str = CONNECTOR_TYPE
+    collection_name: str = Field(description="The name of the Chroma collection to write into.")
+    access_config: Secret[ChromaAccessConfig]
+    path: Optional[str] = Field(
+        default=None, description="Location where Chroma is persisted, if not connecting via http."
+    )
+    tenant: Optional[str] = Field(
+        default="default_tenant", description="The tenant to use for this client."
+    )
+    database: Optional[str] = Field(
+        default="default_database", description="The database to use for this client."
+    )
+    host: Optional[str] = Field(default=None, description="The hostname of the Chroma server.")
+    port: Optional[int] = Field(default=None, description="The port of the Chroma server.")
+    ssl: bool = Field(
+        default=False, description="Whether to use SSL to connect to the Chroma server."
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)


-@dataclass
 class ChromaUploadStagerConfig(UploadStagerConfig):
     pass

@@ -101,9 +110,8 @@ class ChromaUploadStager(UploadStager):
         return output_path


-@dataclass
 class ChromaUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")


 @dataclass
@@ -123,10 +131,11 @@ class ChromaUploader(Uploader):
     def create_client(self) -> "Client":
         import chromadb

+        access_config = self.connection_config.access_config.get_secret_value()
         if self.connection_config.path:
             return chromadb.PersistentClient(
                 path=self.connection_config.path,
-                settings=self.connection_config.access_config.settings,
+                settings=access_config.settings,
                 tenant=self.connection_config.tenant,
                 database=self.connection_config.database,
             )
@@ -136,8 +145,8 @@ class ChromaUploader(Uploader):
                 host=self.connection_config.host,
                 port=self.connection_config.port,
                 ssl=self.connection_config.ssl,
-                headers=self.connection_config.access_config.headers,
-                settings=self.connection_config.access_config.settings,
+                headers=access_config.headers,
+                settings=access_config.settings,
                 tenant=self.connection_config.tenant,
                 database=self.connection_config.database,
             )
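The create_client() change above shows the unwrap-once pattern the migrated connectors use: the Secret is resolved a single time and the plain access model is reused for every client argument. A hedged sketch of the same flow from the caller's side (all connection values are placeholders; field names and defaults come from the diff above):

    config = ChromaConnectionConfig(
        collection_name="ingest-elements",
        access_config=ChromaAccessConfig(headers={"Authorization": "Bearer <token>"}),
        host="localhost",
        port=8000,
    )
    # With path unset and host/port set, create_client() takes the HttpClient branch:
    access = config.access_config.get_secret_value()
    # access.headers and access.settings are what get handed to chromadb.HttpClient(...)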
--- /dev/null
+++ unstructured_ingest/v2/processes/connectors/couchbase.py (0.0.4)
@@ -0,0 +1,151 @@
+import json
+from dataclasses import dataclass, field
+from datetime import timedelta
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    UploadContent,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from couchbase.cluster import Cluster
+
+CONNECTOR_TYPE = "couchbase"
+SERVER_API_VERSION = "1"
+
+
+class CouchbaseAccessConfig(AccessConfig):
+    password: str = Field(description="The password for the Couchbase server")
+
+
+class CouchbaseConnectionConfig(ConnectionConfig):
+    username: str = Field(description="The username for the Couchbase server")
+    bucket: str = Field(description="The bucket to connect to on the Couchbase server")
+    connection_string: str = Field(
+        default="couchbase://localhost", description="The connection string of the Couchbase server"
+    )
+    scope: str = Field(
+        default="_default", description="The scope to connect to on the Couchbase server"
+    )
+    collection: str = Field(
+        default="_default", description="The collection to connect to on the Couchbase server"
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    access_config: Secret[CouchbaseAccessConfig]
+
+
+class CouchbaseUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class CouchbaseUploadStager(UploadStager):
+    upload_stager_config: CouchbaseUploadStagerConfig = field(
+        default_factory=lambda: CouchbaseUploadStagerConfig()
+    )
+
+    def run(
+        self,
+        elements_filepath: Path,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents = json.load(elements_file)
+
+        output_elements = []
+        for element in elements_contents:
+            new_doc = {
+                element["element_id"]: {
+                    "embedding": element.get("embeddings", None),
+                    "text": element.get("text", None),
+                    "metadata": element.get("metadata", None),
+                    "type": element.get("type", None),
+                }
+            }
+            output_elements.append(new_doc)
+
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        with open(output_path, "w") as output_file:
+            json.dump(output_elements, output_file)
+        return output_path
+
+
+class CouchbaseUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=50, description="Number of documents to upload per batch")
+
+
+@dataclass
+class CouchbaseUploader(Uploader):
+    connection_config: CouchbaseConnectionConfig
+    upload_config: CouchbaseUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @requires_dependencies(["couchbase"], extras="couchbase")
+    def connect_to_couchbase(self) -> "Cluster":
+        from couchbase.auth import PasswordAuthenticator
+        from couchbase.cluster import Cluster
+        from couchbase.options import ClusterOptions
+
+        connection_string = self.connection_config.connection_string
+        username = self.connection_config.username
+        password = self.connection_config.access_config.get_secret_value().password
+
+        auth = PasswordAuthenticator(username, password)
+        options = ClusterOptions(auth)
+        options.apply_profile("wan_development")
+        cluster = Cluster(connection_string, options)
+        cluster.wait_until_ready(timedelta(seconds=5))
+        return cluster
+
+    def precheck(self) -> None:
+        try:
+            self.connect_to_couchbase()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
+        elements = []
+        for content in contents:
+            with open(content.path) as elements_file:
+                elements.extend(json.load(elements_file))
+
+        logger.info(
+            f"writing {len(elements)} objects to destination "
+            f"bucket, {self.connection_config.bucket} "
+            f"at {self.connection_config.connection_string}",
+        )
+        cluster = self.connect_to_couchbase()
+        bucket = cluster.bucket(self.connection_config.bucket)
+        scope = bucket.scope(self.connection_config.scope)
+        collection = scope.collection(self.connection_config.collection)
+
+        for chunk in batch_generator(elements, self.upload_config.batch_size):
+            collection.upsert_multi({doc_id: doc for doc in chunk for doc_id, doc in doc.items()})
+
+
+couchbase_destination_entry = DestinationRegistryEntry(
+    connection_config=CouchbaseConnectionConfig,
+    uploader=CouchbaseUploader,
+    uploader_config=CouchbaseUploaderConfig,
+    upload_stager=CouchbaseUploadStager,
+    upload_stager_config=CouchbaseUploadStagerConfig,
+)
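couchbase.py is an entirely new destination in 0.0.4 (it also appears as +151 in the file list above). A hedged end-to-end sketch of how the classes compose; all connection values are placeholders, and only the class and field names are taken from the diff:

    config = CouchbaseConnectionConfig(
        connection_string="couchbase://localhost",  # the default shown in the diff
        username="Administrator",
        bucket="ingest",
        access_config=CouchbaseAccessConfig(password="<password>"),
    )
    uploader = CouchbaseUploader(
        connection_config=config,
        upload_config=CouchbaseUploaderConfig(batch_size=50),
    )
    # precheck() opens a cluster connection (5 s readiness timeout) and raises
    # DestinationConnectionError if the server is unreachable:
    uploader.precheck()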
--- unstructured_ingest/v2/processes/connectors/databricks_volumes.py (0.0.3)
+++ unstructured_ingest/v2/processes/connectors/databricks_volumes.py (0.0.4)
@@ -1,8 +1,9 @@
 import os
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional

-from unstructured_ingest.enhanced_dataclass import enhanced_field
+from pydantic import Field, Secret
+
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -21,45 +22,99 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "databricks_volumes"


-@dataclass
 class DatabricksVolumesAccessConfig(AccessConfig):
-    account_id: Optional[str] = None
-    username: Optional[str] = None
-    password: Optional[str] = None
-    client_id: Optional[str] = None
-    client_secret: Optional[str] = None
-    token: Optional[str] = None
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks "
+        "accounts endpoint. Only has effect when Host is "
+        "either https://accounts.cloud.databricks.com/ (AWS), "
+        "https://accounts.azuredatabricks.net/ (Azure), "
+        "or https://accounts.gcp.databricks.com/ (GCP).",
+    )
+    username: Optional[str] = Field(
+        default=None,
+        description="The Databricks username part of basic authentication. "
+        "Only possible when Host is *.cloud.databricks.com (AWS).",
+    )
+    password: Optional[str] = Field(
+        default=None,
+        description="The Databricks password part of basic authentication. "
+        "Only possible when Host is *.cloud.databricks.com (AWS).",
+    )
+    client_id: Optional[str] = Field(default=None)
+    client_secret: Optional[str] = Field(default=None)
+    token: Optional[str] = Field(
+        default=None,
+        description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
+        "Azure Active Directory (Azure AD) token (Azure).",
+    )
     profile: Optional[str] = None
-    azure_workspace_resource_id: Optional[str] = None
-    azure_client_secret: Optional[str] = None
-    azure_client_id: Optional[str] = None
-    azure_tenant_id: Optional[str] = None
-    azure_environment: Optional[str] = None
-    auth_type: Optional[str] = None
+    azure_workspace_resource_id: Optional[str] = Field(
+        default=None,
+        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+        "which is exchanged for a Databricks host URL.",
+    )
+    azure_client_secret: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s client secret."
+    )
+    azure_client_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s application ID."
+    )
+    azure_tenant_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s tenant ID."
+    )
+    azure_environment: Optional[str] = Field(
+        default=None,
+        description="The Azure environment type for a " "specific set of API endpoints",
+        examples=["Public", "UsGov", "China", "Germany"],
+    )
+    auth_type: Optional[str] = Field(
+        default=None,
+        description="When multiple auth attributes are available in the "
+        "environment, use the auth type specified by this "
+        "argument. This argument also holds the currently "
+        "selected auth.",
+    )
     cluster_id: Optional[str] = None
     google_credentials: Optional[str] = None
     google_service_account: Optional[str] = None


-@dataclass
+SecretDatabricksVolumesAccessConfig = Secret[DatabricksVolumesAccessConfig]
+
+
 class DatabricksVolumesConnectionConfig(ConnectionConfig):
-    access_config: DatabricksVolumesAccessConfig = enhanced_field(
-        default_factory=DatabricksVolumesAccessConfig, sensitive=True
+    access_config: SecretDatabricksVolumesAccessConfig = Field(
+        default_factory=lambda: SecretDatabricksVolumesAccessConfig(
+            secret_value=DatabricksVolumesAccessConfig()
+        )
+    )
+    host: Optional[str] = Field(
+        default=None,
+        description="The Databricks host URL for either the "
+        "Databricks workspace endpoint or the "
+        "Databricks accounts endpoint.",
     )
-    host: Optional[str] = None


-@dataclass
 class DatabricksVolumesUploaderConfig(UploaderConfig):
-    volume: str
-    catalog: str
-    volume_path: Optional[str] = None
-    overwrite: bool = False
-    schema: str = "default"
+    volume: str = Field(description="Name of volume in the Unity Catalog")
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    volume_path: Optional[str] = Field(
+        default=None, description="Optional path within the volume to write to"
+    )
+    overwrite: bool = Field(
+        default=False, description="If true, an existing file will be overwritten."
+    )
+    databricks_schema: str = Field(
+        default="default",
+        alias="schema",
+        description="Schema associated with the volume to write to in the Unity Catalog service",
+    )

     @property
     def path(self) -> str:
-        path = f"/Volumes/{self.catalog}/{self.schema}/{self.volume}"
+        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
         if self.volume_path:
             path = f"{path}/{self.volume_path}"
         return path
@@ -70,19 +125,19 @@ class DatabricksVolumesUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: DatabricksVolumesUploaderConfig
     connection_config: DatabricksVolumesConnectionConfig
-    client: Optional["WorkspaceClient"] = field(init=False, default=None)

     @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
-    def __post_init__(self) -> None:
+    def get_client(self) -> "WorkspaceClient":
         from databricks.sdk import WorkspaceClient

-        self.client = WorkspaceClient(
-            host=self.connection_config.host,
+        return WorkspaceClient(
+            host=self.connection_config.host,
+            **self.connection_config.access_config.get_secret_value().dict(),
         )

     def precheck(self) -> None:
         try:
-            assert self.client.current_user.me().active
+            assert self.get_client().current_user.me().active
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -91,7 +146,7 @@ class DatabricksVolumesUploader(Uploader):
         for content in contents:
             with open(content.path, "rb") as elements_file:
                 output_path = os.path.join(self.upload_config.path, content.path.name)
-                self.client.files.upload(
+                self.get_client().files.upload(
                     file_path=output_path,
                     contents=elements_file,
                     overwrite=self.upload_config.overwrite,