unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +103 -92
- test/integration/connectors/sql/test_singlestore.py +112 -100
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +87 -76
- test/integration/connectors/test_astradb.py +62 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +6 -6
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +7 -4
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
- unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
--- a/unstructured_ingest/v2/processes/connectors/gitlab.py
+++ b/unstructured_ingest/v2/processes/connectors/gitlab.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
@@ -82,16 +83,18 @@ class GitLabConnectionConfig(ConnectionConfig):
 
     @SourceConnectionError.wrap
     @requires_dependencies(["gitlab"], extras="gitlab")
-
+    @contextmanager
+    def get_client(self) -> Generator["Gitlab", None, None]:
         from gitlab import Gitlab
 
         logger.info(f"Connection to GitLab: {self.base_url!r}")
-
+        with Gitlab(
             self.base_url, private_token=self.access_config.get_secret_value().access_token
-        )
-
+        ) as client:
+            yield client
 
-
+    @contextmanager
+    def get_project(self) -> Generator["Project", None, None]:
         """Retrieves the specified GitLab project using the configured base URL and access token.
 
         Returns:
@@ -101,13 +104,12 @@ class GitLabConnectionConfig(ConnectionConfig):
             SourceConnectionError: If the GitLab API connection fails.
             gitlab.exceptions.GitlabGetError: If the project is not found.
         """
-
+        with self.get_client() as client:
+            logger.info(f"Accessing Project: '{self.repo_path}'")
+            project = client.projects.get(self.repo_path)
 
-
-
-
-        logger.info(f"Successfully accessed project '{self.repo_path}'")
-        return project
+            logger.info(f"Successfully accessed project '{self.repo_path}'")
+            yield project
 
 
 class GitLabIndexerConfig(IndexerConfig):
@@ -144,11 +146,11 @@ class GitLabIndexer(Indexer):
         """
 
         try:
-
-
-
-
-
+            with self.connection_config.get_client() as client:
+                if self.connection_config.access_config.get_secret_value().access_token is not None:
+                    client.auth()
+                else:
+                    client.projects.get(self.connection_config.repo_path)
 
         except Exception as e:
             logger.error(f"Failed to validate connection: {e}", exc_info=True)
@@ -168,17 +170,16 @@ class GitLabIndexer(Indexer):
             FileData: A generator that yields `FileData` objects representing each file (blob)
                 in the repository.
         """
-
-
-
-
-
-
-
-
-
-
-        )
+        with self.connection_config.get_project() as project:
+            ref = self.index_config.git_branch or project.default_branch
+
+            files = project.repository_tree(
+                path=str(self.index_config.path),
+                ref=ref,
+                recursive=self.index_config.recursive,
+                iterator=True,
+                all=True,
+            )
 
         for file in files:
             relative_path = str(Path(file["path"]).relative_to(self.index_config.path))
@@ -250,12 +251,12 @@ class GitLabDownloader(Downloader):
 
         ref = file_data.metadata.record_locator["ref"]
         path = file_data.metadata.record_locator["file_path"]
-
-        project_file = self.connection_config.get_project().files.get(file_path=path, ref=ref)
         download_path.parent.mkdir(exist_ok=True, parents=True)
 
-        with
-
+        with self.connection_config.get_project() as project:
+            project_file = project.files.get(file_path=path, ref=ref)
+            with open(download_path, "wb") as file:
+                file.write(project_file.decode())
 
 
 gitlab_source_entry = SourceRegistryEntry(
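The change repeated across the connectors in this release is that connection objects are now exposed as context managers (`get_client`, `get_project`, and similar) instead of plain factory methods, so the underlying SDK connection is always released. A minimal, self-contained sketch of that pattern follows; `FakeClient` and its methods are hypothetical stand-ins, not the library's API.

```python
from contextlib import contextmanager
from typing import Generator


class FakeClient:
    """Hypothetical stand-in for a third-party SDK client (e.g. python-gitlab's Gitlab)."""

    def close(self) -> None:
        print("connection closed")

    def get_project(self, path: str) -> str:
        return f"project:{path}"


@contextmanager
def get_client() -> Generator[FakeClient, None, None]:
    # Create the connection lazily and always release it, even if the body raises.
    client = FakeClient()
    try:
        yield client
    finally:
        client.close()


# Callers scope the connection lifetime to the with-block.
with get_client() as client:
    print(client.get_project("group/repo"))
```

Scoping the client to a `with` block is what lets the precheck, indexing, and download paths above share one acquisition/cleanup code path.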
--- a/unstructured_ingest/v2/processes/connectors/google_drive.py
+++ b/unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -1,5 +1,6 @@
 import io
 import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
@@ -74,7 +75,8 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
     access_config: Secret[GoogleDriveAccessConfig]
 
     @requires_dependencies(["googleapiclient"], extras="google-drive")
-
+    @contextmanager
+    def get_client(self) -> Generator["GoogleAPIResource", None, None]:
         from google.auth import exceptions
         from google.oauth2 import service_account
         from googleapiclient.discovery import build
@@ -86,8 +88,8 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
         try:
             creds = service_account.Credentials.from_service_account_info(key_data)
             service = build("drive", "v3", credentials=creds)
-
-
+            with service.files() as client:
+                yield client
         except HttpError as exc:
             raise ValueError(f"{exc.reason}")
         except exceptions.DefaultCredentialsError:
@@ -132,7 +134,7 @@ class GoogleDriveIndexer(Indexer):
 
     def precheck(self) -> None:
         try:
-            self.connection_config.
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -266,13 +268,14 @@ class GoogleDriveIndexer(Indexer):
         return data
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-
-
-
-
-
-
-
+        with self.connection_config.get_client() as client:
+            for f in self.get_files(
+                files_client=client,
+                object_id=self.connection_config.drive_id,
+                recursive=self.index_config.recursive,
+                extensions=self.index_config.extensions,
+            ):
+                yield f
 
 
 class GoogleDriveDownloaderConfig(DownloaderConfig):
@@ -309,30 +312,30 @@ class GoogleDriveDownloader(Downloader):
         logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
         mime_type = file_data.additional_metadata["mimeType"]
         record_id = file_data.identifier
-
-
-
-
-        )
-        if not export_mime:
-            raise TypeError(
-                f"File not supported. Name: {file_data.source_identifiers.filename} "
-                f"ID: {record_id} "
-                f"MimeType: {mime_type}"
+        with self.connection_config.get_client() as client:
+            if mime_type.startswith("application/vnd.google-apps"):
+                export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
+                    self.meta.get("mimeType"), # type: ignore
                 )
-
-
-
-
-
-
-
+                if not export_mime:
+                    raise TypeError(
+                        f"File not supported. Name: {file_data.source_identifiers.filename} "
+                        f"ID: {record_id} "
+                        f"MimeType: {mime_type}"
+                    )
+
+                request = client.export_media(
+                    fileId=record_id,
+                    mimeType=export_mime,
+                )
+            else:
+                request = client.get_media(fileId=record_id)
 
         file_contents = io.BytesIO()
         downloader = MediaIoBaseDownload(file_contents, request)
         downloaded = self._get_content(downloader=downloader)
         if not downloaded or not file_contents:
-
+            raise SourceConnectionError("nothing found to download")
         return self._write_file(file_data=file_data, file_contents=file_contents)
 
 
--- a/unstructured_ingest/v2/processes/connectors/kafka/kafka.py
+++ b/unstructured_ingest/v2/processes/connectors/kafka/kafka.py
@@ -257,8 +257,6 @@ class KafkaUploader(Uploader, ABC):
         if failed_producer:
             raise KafkaException("failed to produce all messages in batch")
 
-    def
-
-            elements = json.load(elements_file)
-        for element_batch in batch_generator(elements, batch_size=self.upload_config.batch_size):
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        for element_batch in batch_generator(data, batch_size=self.upload_config.batch_size):
             self.produce_batch(elements=element_batch)
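The Kafka hunk above also shows the other recurring change: uploaders gain a `run_data` entry point that receives the already-parsed element dictionaries rather than a path to a JSON file. The sketch below illustrates the shape of that change with simplified, hypothetical names (`SketchUploader`, `produce_batch`); `batch_generator` is assumed to chunk a list, which matches how it is called in the diff.

```python
import json
from pathlib import Path
from typing import Any, Iterator


def batch_generator(items: list[dict], batch_size: int) -> Iterator[list[dict]]:
    # Assumed behaviour: yield fixed-size chunks of the element list.
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]


class SketchUploader:
    batch_size = 100

    def produce_batch(self, elements: list[dict]) -> None:
        print(f"producing {len(elements)} messages")

    # Old shape: the uploader received a file path and loaded the JSON itself.
    def run(self, path: Path, **kwargs: Any) -> None:
        with path.open() as f:
            elements = json.load(f)
        self.run_data(data=elements)

    # New shape: the pipeline passes the parsed element dicts directly.
    def run_data(self, data: list[dict], **kwargs: Any) -> None:
        for element_batch in batch_generator(data, batch_size=self.batch_size):
            self.produce_batch(elements=element_batch)
```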
--- a/unstructured_ingest/v2/processes/connectors/kdbai.py
+++ b/unstructured_ingest/v2/processes/connectors/kdbai.py
@@ -1,14 +1,13 @@
-import
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import numpy as np
 import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.data_prep import flatten_dict, get_data_df, split_dataframe
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -48,12 +47,19 @@ class KdbaiConnectionConfig(ConnectionConfig):
     )
 
     @requires_dependencies(["kdbai_client"], extras="kdbai")
-
+    @contextmanager
+    def get_client(self) -> Generator["Session", None, None]:
         from kdbai_client import Session
 
-
-
-
+        session = None
+        try:
+            session = Session(
+                api_key=self.access_config.get_secret_value().api_key, endpoint=self.endpoint
+            )
+            yield session
+        finally:
+            if session:
+                session.close()
 
 
 class KdbaiUploadStagerConfig(UploadStagerConfig):
@@ -64,38 +70,19 @@ class KdbaiUploadStagerConfig(UploadStagerConfig):
 class KdbaiUploadStager(UploadStager):
     upload_stager_config: KdbaiUploadStagerConfig = field(default_factory=KdbaiUploadStagerConfig)
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-        data = []
-        for element in elements_contents:
-            data.append(
-                {
-                    "id": get_enhanced_element_id(element_dict=element, file_data=file_data),
-                    "element_id": element.get("element_id"),
-                    "document": element.pop("text", None),
-                    "embeddings": element.get("embeddings"),
-                    "metadata": flatten_dict(
-                        dictionary=element.get("metadata"),
-                        flatten_lists=True,
-                        remove_none=True,
-                    ),
-                }
-            )
-        logger.debug(f"writing {len(data)} elements to {output_path}")
-        with output_path.open("w") as output_file:
-            json.dump(data, output_file, indent=2)
-        return output_path
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        return {
+            "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
+            "element_id": data.get("element_id"),
+            "document": data.pop("text", None),
+            "embeddings": data.get("embeddings"),
+            "metadata": flatten_dict(
+                dictionary=data.get("metadata"),
+                flatten_lists=True,
+                remove_none=True,
+            ),
+        }
 
 
 class KdbaiUploaderConfig(UploaderConfig):
@@ -119,50 +106,37 @@ class KdbaiUploader(Uploader):
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-
-
-
-
+    @contextmanager
+    def get_database(self) -> Generator["Database", None, None]:
+        with self.connection_config.get_client() as client:
+            db = client.database(self.upload_config.database_name)
+            yield db
 
-
-
-
-
+    @contextmanager
+    def get_table(self) -> Generator["Table", None, None]:
+        with self.get_database() as db:
+            table = db.table(self.upload_config.table_name)
+            yield table
 
     def upsert_batch(self, batch: pd.DataFrame):
-
-
+        with self.get_table() as table:
+            table.insert(batch)
 
     def process_dataframe(self, df: pd.DataFrame):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
             f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
         )
-        for
+        for batch_df in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
             self.upsert_batch(batch=batch_df)
 
-    def
-
-        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
-        self.process_dataframe(df=df)
-
-    def process_json(self, json_paths: list[Path]):
-        logger.debug(f"uploading content from {len(json_paths)} json files")
-        all_records = []
-        for p in json_paths:
-            with open(p) as json_file:
-                all_records.extend(json.load(json_file))
-
-        df = pd.DataFrame(data=all_records)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
-
-        elif path.suffix == ".json":
-            self.process_json(json_paths=[path])
-        else:
-            raise ValueError(f"Unsupported file type, must be json or csv file: {path}")
+        data = get_data_df(path=path)
+        self.process_dataframe(df=data)
 
 
 kdbai_destination_entry = DestinationRegistryEntry(
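The KDB.AI uploader now leans on `get_data_df` and `split_dataframe` from `unstructured_ingest.utils.data_prep`, whose implementations are not shown in this diff. Assuming `split_dataframe` simply yields fixed-size row slices, the batching it enables looks roughly like this sketch:

```python
from typing import Iterator

import pandas as pd


def split_dataframe(df: pd.DataFrame, chunk_size: int) -> Iterator[pd.DataFrame]:
    # Assumed behaviour: yield successive row slices of at most chunk_size rows.
    for start in range(0, len(df), chunk_size):
        yield df.iloc[start : start + chunk_size]


records = [{"id": i, "document": f"text {i}"} for i in range(250)]
df = pd.DataFrame(records)

for batch in split_dataframe(df, chunk_size=100):
    # In the uploader, each batch would be handed to table.insert(batch).
    print(len(batch))  # 100, 100, 50
```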
--- a/unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py
+++ b/unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py
@@ -41,14 +41,11 @@ class LanceDBConnectionConfig(ConnectionConfig, ABC):
     async def get_async_connection(self) -> AsyncGenerator["AsyncConnection", None]:
         import lancedb
 
-
+        with await lancedb.connect_async(
             self.uri,
             storage_options=self.get_storage_options(),
-        )
-        try:
+        ) as connection:
             yield connection
-        finally:
-            connection.close()
 
 
 class LanceDBRemoteConnectionConfig(LanceDBConnectionConfig):
@@ -85,8 +82,8 @@ class LanceDBUploadStager(UploadStager):
 
         df = pd.DataFrame(
             [
-                self.
-                for
+                self.conform_dict(element_dict=element_dict, file_data=file_data)
+                for element_dict in elements_contents
             ]
         )
 
@@ -95,11 +92,12 @@ class LanceDBUploadStager(UploadStager):
 
         return output_path
 
-    def
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
         return {
-            "vector":
+            "vector": data.pop("embeddings", None),
             RECORD_ID_LABEL: file_data.identifier,
-            **flatten_dict(
+            **flatten_dict(data, separator="-"),
         }
 
 
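The LanceDB and KDB.AI stagers both move per-element conversion into a `conform_dict(element_dict, file_data)` hook, leaving file reading and writing to the shared stager plumbing. The sketch below mirrors the LanceDB field mapping with a simplified, hypothetical `flatten_dict` and a plain `record_id` argument standing in for `FileData`:

```python
from typing import Any


def flatten_dict(dictionary: dict, separator: str = "-") -> dict:
    # Simplified assumption: flatten nested dicts into "parent-child" keys.
    flat: dict = {}
    for key, value in dictionary.items():
        if isinstance(value, dict):
            for sub_key, sub_value in flatten_dict(value, separator).items():
                flat[f"{key}{separator}{sub_key}"] = sub_value
        else:
            flat[key] = value
    return flat


def conform_dict(element_dict: dict, record_id: str) -> dict[str, Any]:
    # One element in, one flat row out; the stager maps this over every element.
    data = element_dict.copy()
    return {
        "vector": data.pop("embeddings", None),
        "record_id": record_id,
        **flatten_dict(data, separator="-"),
    }


element = {"text": "hello", "embeddings": [0.1, 0.2], "metadata": {"filename": "a.pdf"}}
print(conform_dict(element, record_id="file-123"))
```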
--- a/unstructured_ingest/v2/processes/connectors/local.py
+++ b/unstructured_ingest/v2/processes/connectors/local.py
@@ -1,4 +1,5 @@
 import glob
+import json
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -175,7 +176,7 @@ class LocalUploader(Uploader):
     def is_async(self) -> bool:
         return False
 
-    def
+    def get_destination_path(self, file_data: FileData) -> Path:
         if source_identifiers := file_data.source_identifiers:
             rel_path = (
                 source_identifiers.relative_path[1:]
@@ -188,7 +189,17 @@ class LocalUploader(Uploader):
             )
         else:
             final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
-        Path(final_path)
+        final_path = Path(final_path)
+        final_path.parent.mkdir(parents=True, exist_ok=True)
+        return final_path
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        final_path = self.get_destination_path(file_data=file_data)
+        with final_path.open("w") as f:
+            json.dump(data, f)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        final_path = self.get_destination_path(file_data=file_data)
         logger.debug(f"copying file from {path} to {final_path}")
         shutil.copy(src=str(path), dst=str(final_path))
 
--- a/unstructured_ingest/v2/processes/connectors/milvus.py
+++ b/unstructured_ingest/v2/processes/connectors/milvus.py
@@ -1,10 +1,8 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
-import pandas as pd
 from dateutil import parser
 from pydantic import Field, Secret
 
@@ -16,7 +14,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -59,10 +56,17 @@ class MilvusConnectionConfig(ConnectionConfig):
         return connection_config_dict
 
     @requires_dependencies(["pymilvus"], extras="milvus")
-
+    @contextmanager
+    def get_client(self) -> Generator["MilvusClient", None, None]:
         from pymilvus import MilvusClient
 
-
+        client = None
+        try:
+            client = MilvusClient(**self.get_connection_kwargs())
+            yield client
+        finally:
+            if client:
+                client.close()
 
 
 class MilvusUploadStagerConfig(UploadStagerConfig):
@@ -91,8 +95,8 @@ class MilvusUploadStager(UploadStager):
                 pass
         return parser.parse(date_string).timestamp()
 
-    def conform_dict(self,
-        working_data =
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        working_data = element_dict.copy()
         if self.upload_stager_config.flatten_metadata and (
             metadata := working_data.pop("metadata", None)
         ):
@@ -134,29 +138,6 @@ class MilvusUploadStager(UploadStager):
         working_data[RECORD_ID_LABEL] = file_data.identifier
         return working_data
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents: list[dict[str, Any]] = json.load(elements_file)
-        new_content = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-        output_filename_path = Path(output_filename)
-        if output_filename_path.suffix == ".json":
-            output_path = Path(output_dir) / output_filename_path
-        else:
-            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with output_path.open("w") as output_file:
-            json.dump(new_content, output_file, indent=2)
-        return output_path
-
 
 class MilvusUploaderConfig(UploaderConfig):
     db_name: Optional[str] = Field(default=None, description="Milvus database name")
@@ -183,22 +164,10 @@ class MilvusUploader(Uploader):
 
     @contextmanager
    def get_client(self) -> Generator["MilvusClient", None, None]:
-
-
-
-        try:
+        with self.connection_config.get_client() as client:
+            if db_name := self.upload_config.db_name:
+                client.using_database(db_name=db_name)
             yield client
-        finally:
-            client.close()
-
-    def upload(self, content: UploadContent) -> None:
-        file_extension = content.path.suffix
-        if file_extension == ".json":
-            self.upload_json(content=content)
-        elif file_extension == ".csv":
-            self.upload_csv(content=content)
-        else:
-            raise ValueError(f"Unsupported file extension: {file_extension}")
 
     def delete_by_record_id(self, file_data: FileData) -> None:
         logger.info(
@@ -233,19 +202,9 @@ class MilvusUploader(Uploader):
             err_count = res["err_count"]
             raise WriteError(f"failed to upload {err_count} docs")
 
-    def
-        df = pd.read_csv(content.path)
-        data = df.to_dict(orient="records")
-        self.insert_results(data=data)
-
-    def upload_json(self, content: UploadContent) -> None:
-        with content.path.open("r") as file:
-            data: list[dict] = json.load(file)
-        self.insert_results(data=data)
-
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         self.delete_by_record_id(file_data=file_data)
-        self.
+        self.insert_results(data=data)
 
 
 milvus_destination_entry = DestinationRegistryEntry(
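Finally, the Milvus uploader's new `run_data` path deletes any rows previously written for the same source record before inserting the fresh batch. A small in-memory sketch of that delete-then-insert idempotency pattern (not pymilvus; `FakeCollection` is a hypothetical stand-in):

```python
from typing import Any

RECORD_ID_LABEL = "record_id"


class FakeCollection:
    """In-memory stand-in for a vector collection keyed by a record-id field."""

    def __init__(self) -> None:
        self.rows: list[dict[str, Any]] = []

    def delete_by_record_id(self, record_id: str) -> None:
        # Drop rows written by a previous run of the same source record.
        self.rows = [r for r in self.rows if r.get(RECORD_ID_LABEL) != record_id]

    def insert(self, data: list[dict[str, Any]]) -> None:
        self.rows.extend(data)


def run_data(collection: FakeCollection, data: list[dict], record_id: str) -> None:
    # Delete-then-insert keeps re-processing a document idempotent.
    collection.delete_by_record_id(record_id)
    collection.insert(data)


col = FakeCollection()
run_data(col, [{"record_id": "doc-1", "text": "v1"}], record_id="doc-1")
run_data(col, [{"record_id": "doc-1", "text": "v2"}], record_id="doc-1")
print(col.rows)  # only the rows from the latest run remain
```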