unstructured-ingest 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/sql/test_singlestore.py +156 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_kafka.py +67 -0
- test/integration/connectors/test_onedrive.py +112 -0
- test/integration/connectors/test_qdrant.py +137 -0
- test/integration/connectors/test_s3.py +1 -1
- test/integration/connectors/utils/docker.py +2 -1
- test/integration/connectors/utils/docker_compose.py +23 -8
- test/integration/connectors/utils/validation.py +73 -22
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/connector/kafka.py +0 -1
- unstructured_ingest/interfaces.py +7 -7
- unstructured_ingest/v2/interfaces/file_data.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +2 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +15 -7
- unstructured_ingest/v2/processes/connectors/astradb.py +278 -55
- unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -5
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +2 -10
- unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +82 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +196 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +75 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +163 -2
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +5 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +1 -20
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +168 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +15 -6
- unstructured_ingest/v2/processes/partitioner.py +14 -3
- unstructured_ingest/v2/unstructured_api.py +25 -11
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/METADATA +17 -17
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/RECORD +43 -27
- unstructured_ingest/v2/processes/connectors/singlestore.py +0 -156
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/top_level.txt +0 -0
|
@@ -9,7 +9,11 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
|
9
9
|
from dateutil import parser
|
|
10
10
|
from pydantic import Field, Secret
|
|
11
11
|
|
|
12
|
-
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
12
|
+
from unstructured_ingest.error import (
|
|
13
|
+
DestinationConnectionError,
|
|
14
|
+
SourceConnectionError,
|
|
15
|
+
SourceConnectionNetworkError,
|
|
16
|
+
)
|
|
13
17
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
18
|
from unstructured_ingest.v2.interfaces import (
|
|
15
19
|
AccessConfig,
|
|
@@ -22,16 +26,20 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
22
26
|
Indexer,
|
|
23
27
|
IndexerConfig,
|
|
24
28
|
SourceIdentifiers,
|
|
29
|
+
Uploader,
|
|
30
|
+
UploaderConfig,
|
|
25
31
|
download_responses,
|
|
26
32
|
)
|
|
27
33
|
from unstructured_ingest.v2.logger import logger
|
|
28
34
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
35
|
+
DestinationRegistryEntry,
|
|
29
36
|
SourceRegistryEntry,
|
|
30
37
|
)
|
|
31
38
|
|
|
32
39
|
if TYPE_CHECKING:
|
|
33
40
|
from office365.graph_client import GraphClient
|
|
34
41
|
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
42
|
+
from office365.onedrive.drives.drive import Drive
|
|
35
43
|
|
|
36
44
|
CONNECTOR_TYPE = "onedrive"
|
|
37
45
|
MAX_MB_SIZE = 512_000_000
|
|
@@ -55,6 +63,11 @@ class OnedriveConnectionConfig(ConnectionConfig):
|
|
|
55
63
|
)
|
|
56
64
|
access_config: Secret[OnedriveAccessConfig]
|
|
57
65
|
|
|
66
|
+
def get_drive(self) -> "Drive":
|
|
67
|
+
client = self.get_client()
|
|
68
|
+
drive = client.users[self.user_pname].drive
|
|
69
|
+
return drive
|
|
70
|
+
|
|
58
71
|
@requires_dependencies(["msal"], extras="onedrive")
|
|
59
72
|
def get_token(self):
|
|
60
73
|
from msal import ConfidentialClientApplication
|
|
@@ -100,7 +113,6 @@ class OnedriveIndexer(Indexer):
|
|
|
100
113
|
raise SourceConnectionError(
|
|
101
114
|
"{} ({})".format(error, token_resp.get("error_description"))
|
|
102
115
|
)
|
|
103
|
-
self.connection_config.get_client()
|
|
104
116
|
except Exception as e:
|
|
105
117
|
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
106
118
|
raise SourceConnectionError(f"failed to validate connection: {e}")
|
|
@@ -224,6 +236,149 @@ class OnedriveDownloader(Downloader):
|
|
|
224
236
|
return DownloadResponse(file_data=file_data, path=download_path)
|
|
225
237
|
|
|
226
238
|
|
|
239
|
+
class OnedriveUploaderConfig(UploaderConfig):
|
|
240
|
+
remote_url: str = Field(
|
|
241
|
+
description="URL of the destination in OneDrive, e.g., 'onedrive://Documents/Folder'"
|
|
242
|
+
)
|
|
243
|
+
prefix: str = "onedrive://"
|
|
244
|
+
|
|
245
|
+
@property
|
|
246
|
+
def root_folder(self) -> str:
|
|
247
|
+
url = (
|
|
248
|
+
self.remote_url.replace(self.prefix, "", 1)
|
|
249
|
+
if self.remote_url.startswith(self.prefix)
|
|
250
|
+
else self.remote_url
|
|
251
|
+
)
|
|
252
|
+
return url.split("/")[0]
|
|
253
|
+
|
|
254
|
+
@property
|
|
255
|
+
def url(self) -> str:
|
|
256
|
+
url = (
|
|
257
|
+
self.remote_url.replace(self.prefix, "", 1)
|
|
258
|
+
if self.remote_url.startswith(self.prefix)
|
|
259
|
+
else self.remote_url
|
|
260
|
+
)
|
|
261
|
+
return url
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
@dataclass
|
|
265
|
+
class OnedriveUploader(Uploader):
|
|
266
|
+
connection_config: OnedriveConnectionConfig
|
|
267
|
+
upload_config: OnedriveUploaderConfig
|
|
268
|
+
connector_type: str = CONNECTOR_TYPE
|
|
269
|
+
|
|
270
|
+
@requires_dependencies(["office365"], extras="onedrive")
|
|
271
|
+
def precheck(self) -> None:
|
|
272
|
+
from office365.runtime.client_request_exception import ClientRequestException
|
|
273
|
+
|
|
274
|
+
try:
|
|
275
|
+
token_resp: dict = self.connection_config.get_token()
|
|
276
|
+
if error := token_resp.get("error"):
|
|
277
|
+
raise SourceConnectionError(
|
|
278
|
+
"{} ({})".format(error, token_resp.get("error_description"))
|
|
279
|
+
)
|
|
280
|
+
drive = self.connection_config.get_drive()
|
|
281
|
+
root = drive.root
|
|
282
|
+
root_folder = self.upload_config.root_folder
|
|
283
|
+
folder = root.get_by_path(root_folder)
|
|
284
|
+
try:
|
|
285
|
+
folder.get().execute_query()
|
|
286
|
+
except ClientRequestException as e:
|
|
287
|
+
if e.message != "The resource could not be found.":
|
|
288
|
+
raise e
|
|
289
|
+
folder = root.create_folder(root_folder).execute_query()
|
|
290
|
+
logger.info(f"successfully created folder: {folder.name}")
|
|
291
|
+
except Exception as e:
|
|
292
|
+
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
293
|
+
raise SourceConnectionError(f"failed to validate connection: {e}")
|
|
294
|
+
|
|
295
|
+
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
296
|
+
drive = self.connection_config.get_drive()
|
|
297
|
+
|
|
298
|
+
# Use the remote_url from upload_config as the base destination folder
|
|
299
|
+
base_destination_folder = self.upload_config.url
|
|
300
|
+
|
|
301
|
+
# Use the file's relative path to maintain directory structure, if needed
|
|
302
|
+
if file_data.source_identifiers and file_data.source_identifiers.rel_path:
|
|
303
|
+
# Combine the base destination folder with the file's relative path
|
|
304
|
+
destination_path = Path(base_destination_folder) / Path(
|
|
305
|
+
file_data.source_identifiers.rel_path
|
|
306
|
+
)
|
|
307
|
+
else:
|
|
308
|
+
# If no relative path is provided, upload directly to the base destination folder
|
|
309
|
+
destination_path = Path(base_destination_folder) / path.name
|
|
310
|
+
|
|
311
|
+
destination_folder = destination_path.parent
|
|
312
|
+
file_name = destination_path.name
|
|
313
|
+
|
|
314
|
+
# Convert destination folder to a string suitable for OneDrive API
|
|
315
|
+
destination_folder_str = str(destination_folder).replace("\\", "/")
|
|
316
|
+
|
|
317
|
+
# Resolve the destination folder in OneDrive, creating it if necessary
|
|
318
|
+
try:
|
|
319
|
+
# Attempt to get the folder
|
|
320
|
+
folder = drive.root.get_by_path(destination_folder_str)
|
|
321
|
+
folder.get().execute_query()
|
|
322
|
+
except Exception:
|
|
323
|
+
# Folder doesn't exist, create it recursively
|
|
324
|
+
current_folder = drive.root
|
|
325
|
+
for part in destination_folder.parts:
|
|
326
|
+
# Use filter to find the folder by name
|
|
327
|
+
folders = (
|
|
328
|
+
current_folder.children.filter(f"name eq '{part}' and folder ne null")
|
|
329
|
+
.get()
|
|
330
|
+
.execute_query()
|
|
331
|
+
)
|
|
332
|
+
if folders:
|
|
333
|
+
current_folder = folders[0]
|
|
334
|
+
else:
|
|
335
|
+
# Folder doesn't exist, create it
|
|
336
|
+
current_folder = current_folder.create_folder(part).execute_query()
|
|
337
|
+
folder = current_folder
|
|
338
|
+
|
|
339
|
+
# Check the size of the file
|
|
340
|
+
file_size = path.stat().st_size
|
|
341
|
+
|
|
342
|
+
if file_size < MAX_MB_SIZE:
|
|
343
|
+
# Use simple upload for small files
|
|
344
|
+
with path.open("rb") as local_file:
|
|
345
|
+
content = local_file.read()
|
|
346
|
+
logger.info(f"Uploading {path} to {destination_path} using simple upload")
|
|
347
|
+
try:
|
|
348
|
+
uploaded_file = folder.upload(file_name, content).execute_query()
|
|
349
|
+
if not uploaded_file or uploaded_file.name != file_name:
|
|
350
|
+
raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
|
|
351
|
+
# Log details about the uploaded file
|
|
352
|
+
logger.info(
|
|
353
|
+
f"Uploaded file '{uploaded_file.name}' with ID '{uploaded_file.id}'"
|
|
354
|
+
)
|
|
355
|
+
except Exception as e:
|
|
356
|
+
logger.error(f"Failed to upload file '{file_name}': {e}", exc_info=True)
|
|
357
|
+
raise DestinationConnectionError(
|
|
358
|
+
f"Failed to upload file '{file_name}': {e}"
|
|
359
|
+
) from e
|
|
360
|
+
else:
|
|
361
|
+
# Use resumable upload for large files
|
|
362
|
+
destination_fullpath = f"{destination_folder_str}/{file_name}"
|
|
363
|
+
destination_drive_item = drive.root.item_with_path(destination_fullpath)
|
|
364
|
+
|
|
365
|
+
logger.info(f"Uploading {path} to {destination_fullpath} using resumable upload")
|
|
366
|
+
try:
|
|
367
|
+
uploaded_file = destination_drive_item.resumable_upload(
|
|
368
|
+
source_path=str(path)
|
|
369
|
+
).execute_query()
|
|
370
|
+
# Validate the upload
|
|
371
|
+
if not uploaded_file or uploaded_file.name != file_name:
|
|
372
|
+
raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
|
|
373
|
+
# Log details about the uploaded file
|
|
374
|
+
logger.info(f"Uploaded file {uploaded_file.name} with ID {uploaded_file.id}")
|
|
375
|
+
except Exception as e:
|
|
376
|
+
logger.error(f"Failed to upload file '{file_name}' using resumable upload: {e}")
|
|
377
|
+
raise DestinationConnectionError(
|
|
378
|
+
f"Failed to upload file '{file_name}' using resumable upload: {e}"
|
|
379
|
+
) from e
|
|
380
|
+
|
|
381
|
+
|
|
227
382
|
onedrive_source_entry = SourceRegistryEntry(
|
|
228
383
|
connection_config=OnedriveConnectionConfig,
|
|
229
384
|
indexer_config=OnedriveIndexerConfig,
|
|
@@ -231,3 +386,9 @@ onedrive_source_entry = SourceRegistryEntry(
|
|
|
231
386
|
downloader_config=OnedriveDownloaderConfig,
|
|
232
387
|
downloader=OnedriveDownloader,
|
|
233
388
|
)
|
|
389
|
+
|
|
390
|
+
onedrive_destination_entry = DestinationRegistryEntry(
|
|
391
|
+
connection_config=OnedriveConnectionConfig,
|
|
392
|
+
uploader=OnedriveUploader,
|
|
393
|
+
uploader_config=OnedriveUploaderConfig,
|
|
394
|
+
)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
4
|
+
add_destination_entry,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR_TYPE
|
|
8
|
+
from .cloud import qdrant_cloud_destination_entry
|
|
9
|
+
from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
|
|
10
|
+
from .local import qdrant_local_destination_entry
|
|
11
|
+
from .server import CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE
|
|
12
|
+
from .server import qdrant_server_destination_entry
|
|
13
|
+
|
|
14
|
+
add_destination_entry(destination_type=CLOUD_CONNECTOR_TYPE, entry=qdrant_cloud_destination_entry)
|
|
15
|
+
add_destination_entry(destination_type=SERVER_CONNECTOR_TYPE, entry=qdrant_server_destination_entry)
|
|
16
|
+
add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=qdrant_local_destination_entry)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, Secret
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
6
|
+
from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
|
|
7
|
+
QdrantAccessConfig,
|
|
8
|
+
QdrantConnectionConfig,
|
|
9
|
+
QdrantUploader,
|
|
10
|
+
QdrantUploaderConfig,
|
|
11
|
+
QdrantUploadStager,
|
|
12
|
+
QdrantUploadStagerConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
CONNECTOR_TYPE = "qdrant-cloud"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CloudQdrantAccessConfig(QdrantAccessConfig):
|
|
19
|
+
api_key: str = Field(description="Qdrant API key")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class CloudQdrantConnectionConfig(QdrantConnectionConfig):
|
|
23
|
+
url: str = Field(default=None, description="url of Qdrant Cloud")
|
|
24
|
+
access_config: Secret[CloudQdrantAccessConfig]
|
|
25
|
+
|
|
26
|
+
def get_client_kwargs(self) -> dict:
|
|
27
|
+
return {
|
|
28
|
+
"api_key": self.access_config.get_secret_value().api_key,
|
|
29
|
+
"url": self.url,
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class CloudQdrantUploadStagerConfig(QdrantUploadStagerConfig):
|
|
34
|
+
pass
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
|
|
38
|
+
class CloudQdrantUploadStager(QdrantUploadStager):
|
|
39
|
+
upload_stager_config: CloudQdrantUploadStagerConfig
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class CloudQdrantUploaderConfig(QdrantUploaderConfig):
|
|
43
|
+
pass
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
|
|
47
|
+
class CloudQdrantUploader(QdrantUploader):
|
|
48
|
+
connection_config: CloudQdrantConnectionConfig
|
|
49
|
+
upload_config: CloudQdrantUploaderConfig
|
|
50
|
+
connector_type: str = CONNECTOR_TYPE
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
qdrant_cloud_destination_entry = DestinationRegistryEntry(
|
|
54
|
+
connection_config=CloudQdrantConnectionConfig,
|
|
55
|
+
uploader=CloudQdrantUploader,
|
|
56
|
+
uploader_config=CloudQdrantUploaderConfig,
|
|
57
|
+
upload_stager=CloudQdrantUploadStager,
|
|
58
|
+
upload_stager_config=CloudQdrantUploadStagerConfig,
|
|
59
|
+
)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, Secret
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
6
|
+
from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
|
|
7
|
+
QdrantAccessConfig,
|
|
8
|
+
QdrantConnectionConfig,
|
|
9
|
+
QdrantUploader,
|
|
10
|
+
QdrantUploaderConfig,
|
|
11
|
+
QdrantUploadStager,
|
|
12
|
+
QdrantUploadStagerConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
CONNECTOR_TYPE = "qdrant-local"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LocalQdrantAccessConfig(QdrantAccessConfig):
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class LocalQdrantConnectionConfig(QdrantConnectionConfig):
|
|
23
|
+
path: str = Field(default=None, description="Persistence path for QdrantLocal.")
|
|
24
|
+
access_config: Secret[LocalQdrantAccessConfig] = Field(
|
|
25
|
+
default_factory=LocalQdrantAccessConfig, validate_default=True
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
def get_client_kwargs(self) -> dict:
|
|
29
|
+
return {"path": self.path}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class LocalQdrantUploadStagerConfig(QdrantUploadStagerConfig):
|
|
33
|
+
pass
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
|
|
37
|
+
class LocalQdrantUploadStager(QdrantUploadStager):
|
|
38
|
+
upload_stager_config: LocalQdrantUploadStagerConfig
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class LocalQdrantUploaderConfig(QdrantUploaderConfig):
|
|
42
|
+
pass
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
|
|
46
|
+
class LocalQdrantUploader(QdrantUploader):
|
|
47
|
+
connection_config: LocalQdrantConnectionConfig
|
|
48
|
+
upload_config: LocalQdrantUploaderConfig
|
|
49
|
+
connector_type: str = CONNECTOR_TYPE
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
qdrant_local_destination_entry = DestinationRegistryEntry(
|
|
53
|
+
connection_config=LocalQdrantConnectionConfig,
|
|
54
|
+
uploader=LocalQdrantUploader,
|
|
55
|
+
uploader_config=LocalQdrantUploaderConfig,
|
|
56
|
+
upload_stager=LocalQdrantUploadStager,
|
|
57
|
+
upload_stager_config=LocalQdrantUploadStagerConfig,
|
|
58
|
+
)
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import uuid
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from contextlib import asynccontextmanager
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import Field, Secret
|
|
11
|
+
|
|
12
|
+
from unstructured_ingest.error import DestinationConnectionError, WriteError
|
|
13
|
+
from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
|
|
14
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
15
|
+
from unstructured_ingest.v2.interfaces import (
|
|
16
|
+
AccessConfig,
|
|
17
|
+
ConnectionConfig,
|
|
18
|
+
FileData,
|
|
19
|
+
Uploader,
|
|
20
|
+
UploaderConfig,
|
|
21
|
+
UploadStager,
|
|
22
|
+
UploadStagerConfig,
|
|
23
|
+
)
|
|
24
|
+
from unstructured_ingest.v2.logger import logger
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from qdrant_client import AsyncQdrantClient
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class QdrantAccessConfig(AccessConfig, ABC):
|
|
31
|
+
pass
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class QdrantConnectionConfig(ConnectionConfig, ABC):
|
|
35
|
+
access_config: Secret[QdrantAccessConfig] = Field(
|
|
36
|
+
default_factory=QdrantAccessConfig, validate_default=True, description="Access Config"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
@abstractmethod
|
|
40
|
+
def get_client_kwargs(self) -> dict:
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
@requires_dependencies(["qdrant_client"], extras="qdrant")
|
|
44
|
+
@asynccontextmanager
|
|
45
|
+
async def get_client(self) -> AsyncGenerator["AsyncQdrantClient", None]:
|
|
46
|
+
from qdrant_client.async_qdrant_client import AsyncQdrantClient
|
|
47
|
+
|
|
48
|
+
client_kwargs = self.get_client_kwargs()
|
|
49
|
+
client = AsyncQdrantClient(**client_kwargs)
|
|
50
|
+
try:
|
|
51
|
+
yield client
|
|
52
|
+
finally:
|
|
53
|
+
await client.close()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class QdrantUploadStagerConfig(UploadStagerConfig):
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
|
|
61
|
+
class QdrantUploadStager(UploadStager, ABC):
|
|
62
|
+
upload_stager_config: QdrantUploadStagerConfig = field(
|
|
63
|
+
default_factory=lambda: QdrantUploadStagerConfig()
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
@staticmethod
|
|
67
|
+
def conform_dict(data: dict) -> dict:
|
|
68
|
+
"""Prepares dictionary in the format that Chroma requires"""
|
|
69
|
+
return {
|
|
70
|
+
"id": str(uuid.uuid4()),
|
|
71
|
+
"vector": data.pop("embeddings", {}),
|
|
72
|
+
"payload": {
|
|
73
|
+
"text": data.pop("text", None),
|
|
74
|
+
"element_serialized": json.dumps(data),
|
|
75
|
+
**flatten_dict(
|
|
76
|
+
data,
|
|
77
|
+
separator="-",
|
|
78
|
+
flatten_lists=True,
|
|
79
|
+
),
|
|
80
|
+
},
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
def run(
|
|
84
|
+
self,
|
|
85
|
+
elements_filepath: Path,
|
|
86
|
+
file_data: FileData,
|
|
87
|
+
output_dir: Path,
|
|
88
|
+
output_filename: str,
|
|
89
|
+
**kwargs: Any,
|
|
90
|
+
) -> Path:
|
|
91
|
+
with open(elements_filepath) as elements_file:
|
|
92
|
+
elements_contents = json.load(elements_file)
|
|
93
|
+
|
|
94
|
+
conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
|
|
95
|
+
output_path = Path(output_dir) / Path(f"{output_filename}.json")
|
|
96
|
+
|
|
97
|
+
with open(output_path, "w") as output_file:
|
|
98
|
+
json.dump(conformed_elements, output_file)
|
|
99
|
+
return output_path
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class QdrantUploaderConfig(UploaderConfig):
|
|
103
|
+
collection_name: str = Field(description="Name of the collection.")
|
|
104
|
+
batch_size: int = Field(default=50, description="Number of records per batch.")
|
|
105
|
+
num_processes: Optional[int] = Field(
|
|
106
|
+
default=1,
|
|
107
|
+
description="Optional limit on number of threads to use for upload.",
|
|
108
|
+
deprecated=True,
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@dataclass
|
|
113
|
+
class QdrantUploader(Uploader, ABC):
|
|
114
|
+
upload_config: QdrantUploaderConfig
|
|
115
|
+
connection_config: QdrantConnectionConfig
|
|
116
|
+
|
|
117
|
+
@DestinationConnectionError.wrap
|
|
118
|
+
def precheck(self) -> None:
|
|
119
|
+
async def check_connection():
|
|
120
|
+
async with self.connection_config.get_client() as async_client:
|
|
121
|
+
await async_client.get_collections()
|
|
122
|
+
|
|
123
|
+
asyncio.run(check_connection())
|
|
124
|
+
|
|
125
|
+
def is_async(self):
|
|
126
|
+
return True
|
|
127
|
+
|
|
128
|
+
async def run_async(
|
|
129
|
+
self,
|
|
130
|
+
path: Path,
|
|
131
|
+
file_data: FileData,
|
|
132
|
+
**kwargs: Any,
|
|
133
|
+
) -> None:
|
|
134
|
+
with path.open("r") as file:
|
|
135
|
+
elements: list[dict] = json.load(file)
|
|
136
|
+
|
|
137
|
+
logger.debug("Loaded %i elements from %s", len(elements), path)
|
|
138
|
+
|
|
139
|
+
batches = list(batch_generator(elements, batch_size=self.upload_config.batch_size))
|
|
140
|
+
logger.debug(
|
|
141
|
+
"Elements split into %i batches of size %i.",
|
|
142
|
+
len(batches),
|
|
143
|
+
self.upload_config.batch_size,
|
|
144
|
+
)
|
|
145
|
+
await asyncio.gather(*[self._upsert_batch(batch) for batch in batches])
|
|
146
|
+
|
|
147
|
+
async def _upsert_batch(self, batch: list[dict]) -> None:
|
|
148
|
+
from qdrant_client import models
|
|
149
|
+
|
|
150
|
+
points: list[models.PointStruct] = [models.PointStruct(**item) for item in batch]
|
|
151
|
+
try:
|
|
152
|
+
logger.debug(
|
|
153
|
+
"Upserting %i points to the '%s' collection.",
|
|
154
|
+
len(points),
|
|
155
|
+
self.upload_config.collection_name,
|
|
156
|
+
)
|
|
157
|
+
async with self.connection_config.get_client() as async_client:
|
|
158
|
+
await async_client.upsert(
|
|
159
|
+
self.upload_config.collection_name, points=points, wait=True
|
|
160
|
+
)
|
|
161
|
+
except Exception as api_error:
|
|
162
|
+
logger.error(
|
|
163
|
+
"Failed to upsert points to the collection due to the following error %s", api_error
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
raise WriteError(f"Qdrant error: {api_error}") from api_error
|
|
167
|
+
|
|
168
|
+
logger.debug("Successfully upsert points to the collection.")
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, Secret
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
6
|
+
from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
|
|
7
|
+
QdrantAccessConfig,
|
|
8
|
+
QdrantConnectionConfig,
|
|
9
|
+
QdrantUploader,
|
|
10
|
+
QdrantUploaderConfig,
|
|
11
|
+
QdrantUploadStager,
|
|
12
|
+
QdrantUploadStagerConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
CONNECTOR_TYPE = "qdrant-server"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ServerQdrantAccessConfig(QdrantAccessConfig):
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ServerQdrantConnectionConfig(QdrantConnectionConfig):
|
|
23
|
+
url: str = Field(default=None, description="url of Qdrant server")
|
|
24
|
+
access_config: Secret[ServerQdrantAccessConfig] = Field(
|
|
25
|
+
default_factory=ServerQdrantAccessConfig, validate_default=True
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
def get_client_kwargs(self) -> dict:
|
|
29
|
+
return {
|
|
30
|
+
"url": self.url,
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ServerQdrantUploadStagerConfig(QdrantUploadStagerConfig):
|
|
35
|
+
pass
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
|
|
39
|
+
class ServerQdrantUploadStager(QdrantUploadStager):
|
|
40
|
+
upload_stager_config: ServerQdrantUploadStagerConfig
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ServerQdrantUploaderConfig(QdrantUploaderConfig):
|
|
44
|
+
pass
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
|
|
48
|
+
class ServerQdrantUploader(QdrantUploader):
|
|
49
|
+
connection_config: ServerQdrantConnectionConfig
|
|
50
|
+
upload_config: ServerQdrantUploaderConfig
|
|
51
|
+
connector_type: str = CONNECTOR_TYPE
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
qdrant_server_destination_entry = DestinationRegistryEntry(
|
|
55
|
+
connection_config=ServerQdrantConnectionConfig,
|
|
56
|
+
uploader=ServerQdrantUploader,
|
|
57
|
+
uploader_config=ServerQdrantUploaderConfig,
|
|
58
|
+
upload_stager=ServerQdrantUploadStager,
|
|
59
|
+
upload_stager_config=ServerQdrantUploadStagerConfig,
|
|
60
|
+
)
|
|
@@ -7,6 +7,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
7
7
|
|
|
8
8
|
from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
|
|
9
9
|
from .postgres import postgres_destination_entry, postgres_source_entry
|
|
10
|
+
from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
|
|
11
|
+
from .singlestore import singlestore_destination_entry
|
|
10
12
|
from .snowflake import CONNECTOR_TYPE as SNOWFLAKE_CONNECTOR_TYPE
|
|
11
13
|
from .snowflake import snowflake_destination_entry, snowflake_source_entry
|
|
12
14
|
from .sqlite import CONNECTOR_TYPE as SQLITE_CONNECTOR_TYPE
|
|
@@ -19,3 +21,6 @@ add_source_entry(source_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake_source_en
|
|
|
19
21
|
add_destination_entry(destination_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_destination_entry)
|
|
20
22
|
add_destination_entry(destination_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_destination_entry)
|
|
21
23
|
add_destination_entry(destination_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake_destination_entry)
|
|
24
|
+
add_destination_entry(
|
|
25
|
+
destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
|
|
26
|
+
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from contextlib import contextmanager
|
|
2
2
|
from dataclasses import dataclass, field
|
|
3
|
-
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
3
|
+
from typing import TYPE_CHECKING, Generator, Optional
|
|
4
4
|
|
|
5
5
|
from pydantic import Field, Secret
|
|
6
6
|
|
|
@@ -12,7 +12,6 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
12
12
|
SourceRegistryEntry,
|
|
13
13
|
)
|
|
14
14
|
from unstructured_ingest.v2.processes.connectors.sql.sql import (
|
|
15
|
-
_DATE_COLUMNS,
|
|
16
15
|
SQLAccessConfig,
|
|
17
16
|
SQLConnectionConfig,
|
|
18
17
|
SQLDownloader,
|
|
@@ -23,7 +22,6 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
|
|
|
23
22
|
SQLUploaderConfig,
|
|
24
23
|
SQLUploadStager,
|
|
25
24
|
SQLUploadStagerConfig,
|
|
26
|
-
parse_date_string,
|
|
27
25
|
)
|
|
28
26
|
|
|
29
27
|
if TYPE_CHECKING:
|
|
@@ -138,23 +136,6 @@ class PostgresUploader(SQLUploader):
|
|
|
138
136
|
connector_type: str = CONNECTOR_TYPE
|
|
139
137
|
values_delimiter: str = "%s"
|
|
140
138
|
|
|
141
|
-
def prepare_data(
|
|
142
|
-
self, columns: list[str], data: tuple[tuple[Any, ...], ...]
|
|
143
|
-
) -> list[tuple[Any, ...]]:
|
|
144
|
-
output = []
|
|
145
|
-
for row in data:
|
|
146
|
-
parsed = []
|
|
147
|
-
for column_name, value in zip(columns, row):
|
|
148
|
-
if column_name in _DATE_COLUMNS:
|
|
149
|
-
if value is None:
|
|
150
|
-
parsed.append(None)
|
|
151
|
-
else:
|
|
152
|
-
parsed.append(parse_date_string(value))
|
|
153
|
-
else:
|
|
154
|
-
parsed.append(value)
|
|
155
|
-
output.append(tuple(parsed))
|
|
156
|
-
return output
|
|
157
|
-
|
|
158
139
|
|
|
159
140
|
postgres_source_entry = SourceRegistryEntry(
|
|
160
141
|
connection_config=PostgresConnectionConfig,
|