unstructured-ingest 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic.
- test/integration/connectors/databricks/test_volumes_native.py +10 -6
- test/integration/connectors/discord/test_discord.py +4 -4
- test/integration/connectors/duckdb/test_duckdb.py +3 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +8 -7
- test/integration/connectors/elasticsearch/test_opensearch.py +8 -7
- test/integration/connectors/sql/test_databricks_delta_tables.py +142 -0
- test/integration/connectors/sql/test_postgres.py +9 -3
- test/integration/connectors/sql/test_singlestore.py +9 -3
- test/integration/connectors/sql/test_snowflake.py +9 -3
- test/integration/connectors/sql/test_sqlite.py +9 -3
- test/integration/connectors/test_astradb.py +25 -9
- test/integration/connectors/test_azure_ai_search.py +3 -4
- test/integration/connectors/test_chroma.py +4 -6
- test/integration/connectors/test_confluence.py +3 -5
- test/integration/connectors/test_delta_table.py +4 -6
- test/integration/connectors/test_lancedb.py +3 -3
- test/integration/connectors/test_milvus.py +10 -5
- test/integration/connectors/test_mongodb.py +9 -9
- test/integration/connectors/test_neo4j.py +16 -8
- test/integration/connectors/test_notion.py +7 -0
- test/integration/connectors/test_onedrive.py +2 -4
- test/integration/connectors/test_pinecone.py +73 -8
- test/integration/connectors/test_qdrant.py +5 -4
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +7 -6
- test/integration/connectors/test_vectara.py +2 -2
- test/integration/connectors/utils/constants.py +6 -0
- test/integration/connectors/utils/docker.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +5 -0
- test/integration/connectors/weaviate/test_local.py +2 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +6 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +106 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +12 -12
- unstructured_ingest/v2/processes/connectors/pinecone.py +18 -11
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +6 -0
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +213 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +26 -9
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/METADATA +20 -18
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/RECORD +47 -44
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/top_level.txt +0 -0

test/integration/connectors/test_s3.py
@@ -6,6 +6,7 @@ from pathlib import Path
 import pytest
 
 from test.integration.connectors.utils.constants import (
+    BLOB_STORAGE_TAG,
     DESTINATION_TAG,
     SOURCE_TAG,
     env_setup_path,
@@ -47,7 +48,7 @@ def anon_connection_config() -> S3ConnectionConfig:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 async def test_s3_source(anon_connection_config: S3ConnectionConfig):
     indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/")
     with tempfile.TemporaryDirectory() as tempdir:
@@ -70,7 +71,7 @@ async def test_s3_source(anon_connection_config: S3ConnectionConfig):
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig):
     indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/special-characters/")
     with tempfile.TemporaryDirectory() as tempdir:
@@ -92,7 +93,7 @@ async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig
     )
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
     indexer_config = S3IndexerConfig(remote_url="s3://utic-ingest-test-fixtures/destination/")
     indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
@@ -100,7 +101,7 @@ def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
         indexer.precheck()
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 def test_s3_source_no_bucket(anon_connection_config: S3ConnectionConfig):
     indexer_config = S3IndexerConfig(remote_url="s3://fake-bucket")
     indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
@@ -109,7 +110,7 @@ def test_s3_source_no_bucket(anon_connection_config: S3ConnectionConfig):
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio")
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio", BLOB_STORAGE_TAG)
 async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
     anon_connection_config.endpoint_url = "http://localhost:9000"
     indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/")
@@ -149,7 +150,7 @@ def get_aws_credentials() -> dict:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
 @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
 async def test_s3_destination(upload_file: Path):
     aws_credentials = get_aws_credentials()

test/integration/connectors/test_vectara.py
@@ -8,7 +8,7 @@ from uuid import uuid4
 import pytest
 import requests
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.logger import logger
@@ -211,7 +211,7 @@ def corpora_util() -> Generator[str, None, None]:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara")
+@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
 async def test_vectara_destination(
     upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10

test/integration/connectors/utils/constants.py
@@ -2,6 +2,12 @@ from pathlib import Path
 
 SOURCE_TAG = "source"
 DESTINATION_TAG = "destination"
+BLOB_STORAGE_TAG = "blob_storage"
+SQL_TAG = "sql"
+NOSQL_TAG = "nosql"
+VECTOR_DB_TAG = "vector_db"
+GRAPH_DB_TAG = "graph_db"
+UNCATEGORIZED_TAG = "uncategorized"
 
 env_setup_path = Path(__file__).parents[1] / "env_setup"
 expected_results_path = Path(__file__).parents[1] / "expected_results"
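
These constants feed the repository's custom @pytest.mark.tags(...) marker, which now carries a storage-category tag (blob storage, SQL, NoSQL, vector DB, graph DB) alongside the connector type so integration tests can be selected by backend category. The selection mechanism itself is not part of this diff; the snippet below is a hypothetical conftest.py sketch of how such a marker could be filtered, with the requested tag hard-coded for illustration.

# Hypothetical sketch (not part of this package): skip any test whose custom
# `tags` marker does not include the requested category tag.
import pytest

REQUESTED_TAG = "blob_storage"  # assumed to come from a CLI option or env var in practice


def pytest_collection_modifyitems(config, items):
    for item in items:
        marker = item.get_closest_marker("tags")
        if marker is not None and REQUESTED_TAG not in marker.args:
            item.add_marker(pytest.mark.skip(reason=f"not tagged {REQUESTED_TAG!r}"))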

test/integration/connectors/utils/docker.py
@@ -44,7 +44,7 @@ def get_container(
     docker_client: docker.DockerClient,
     image: str,
     ports: dict,
-    name: Optional[str] =
+    name: Optional[str] = None,
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
     healthcheck: Optional[HealthCheck] = None,
@@ -115,7 +115,7 @@ def container_context(
     healthcheck: Optional[HealthCheck] = None,
     healthcheck_retries: int = 30,
     docker_client: Optional[docker.DockerClient] = None,
-    name: Optional[str] =
+    name: Optional[str] = None,
 ):
     docker_client = docker_client or docker.from_env()
     print(f"pulling image {image}")

test/integration/connectors/weaviate/test_cloud.py
@@ -1,12 +1,15 @@
 import pytest
 from pydantic import ValidationError
 
+from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
 from unstructured_ingest.v2.processes.connectors.weaviate.cloud import (
+    CONNECTOR_TYPE,
     CloudWeaviateAccessConfig,
     CloudWeaviateConnectionConfig,
 )
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_weaviate_failing_connection_config():
     with pytest.raises(ValidationError):
         CloudWeaviateConnectionConfig(
@@ -16,6 +19,7 @@ def test_weaviate_failing_connection_config():
         )
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_weaviate_connection_config_happy_path():
     CloudWeaviateConnectionConfig(
         access_config=CloudWeaviateAccessConfig(
@@ -25,6 +29,7 @@ def test_weaviate_connection_config_happy_path():
     )
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_weaviate_connection_config_anonymous():
     CloudWeaviateConnectionConfig(
         access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),

test/integration/connectors/weaviate/test_local.py
@@ -7,7 +7,7 @@ import requests
 import weaviate
 from weaviate.client import WeaviateClient
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
 from test.integration.connectors.utils.docker import container_context
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.weaviate.local import (
@@ -74,7 +74,7 @@ def run_uploader_and_validate(
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.13"  # pragma: no cover
+__version__ = "0.3.15"  # pragma: no cover

unstructured_ingest/v2/interfaces/upload_stager.py
@@ -2,7 +2,7 @@ import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, TypeVar
+from typing import Any, Optional, TypeVar
 
 import ndjson
 from pydantic import BaseModel
@@ -22,10 +22,10 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
 class UploadStager(BaseProcess, ABC):
     upload_stager_config: UploadStagerConfigT
 
-    def write_output(self, output_path: Path, data: list[dict]) -> None:
+    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
         if output_path.suffix == ".json":
             with output_path.open("w") as f:
-                json.dump(data, f, indent=2)
+                json.dump(data, f, indent=indent)
         elif output_path.suffix == ".ndjson":
             with output_path.open("w") as f:
                 ndjson.dump(data, f)
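
The only behavioral knob added here is the indent argument, which lets subclasses write compact JSON instead of the pretty-printed default (the Databricks delta-table stager further below overrides it for exactly that reason). A quick standard-library illustration of the difference:

import json

elements = [{"element_id": "abc123", "text": "hello world"}]

print(json.dumps(elements, indent=2))     # pretty-printed across several lines
print(json.dumps(elements, indent=None))  # single line, newline-free output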

unstructured_ingest/v2/processes/connectors/databricks/__init__.py
@@ -25,6 +25,8 @@ from .volumes_native import (
     databricks_native_volumes_destination_entry,
     databricks_native_volumes_source_entry,
 )
+from .volumes_table import CONNECTOR_TYPE as VOLUMES_TABLE_CONNECTOR_TYPE
+from .volumes_table import databricks_volumes_delta_tables_destination_entry
 
 add_source_entry(source_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_source_entry)
 add_destination_entry(
@@ -50,3 +52,7 @@ add_source_entry(
 add_destination_entry(
     destination_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_destination_entry
 )
+add_destination_entry(
+    destination_type=VOLUMES_TABLE_CONNECTOR_TYPE,
+    entry=databricks_volumes_delta_tables_destination_entry,
+)
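
Registration makes the new destination addressable by its connector-type key once the package is imported. A minimal sanity check might look like the sketch below; it assumes the registry module exposes a destination_registry mapping keyed by connector type, which this diff itself does not show.

# Sketch only: `destination_registry` and its shape are assumptions, not shown in this diff.
import unstructured_ingest.v2.processes.connectors.databricks  # runs the add_destination_entry calls
from unstructured_ingest.v2.processes.connector_registry import destination_registry

assert "databricks_volume_delta_tables" in destination_registry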

unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -187,6 +187,11 @@ class DatabricksVolumesUploader(Uploader, ABC):
     upload_config: DatabricksVolumesUploaderConfig
     connection_config: DatabricksVolumesConnectionConfig
 
+    def get_output_path(self, file_data: FileData) -> str:
+        return os.path.join(
+            self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
+        )
+
     def precheck(self) -> None:
         try:
             assert self.connection_config.get_client().current_user.me().active
@@ -194,9 +199,7 @@
             raise self.connection_config.wrap_error(e=e)
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        output_path = os.path.join(
-            self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
-        )
+        output_path = self.get_output_path(file_data=file_data)
         with open(path, "rb") as elements_file:
             try:
                 self.connection_config.get_client().files.upload(

unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py (new file)
@@ -0,0 +1,106 @@
+import json
+import os
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Generator, Optional
+
+from pydantic import Field
+
+from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
+from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
+    DatabrickDeltaTablesConnectionConfig,
+    DatabrickDeltaTablesUploadStager,
+    DatabrickDeltaTablesUploadStagerConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volume_delta_tables"
+
+
+class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
+    database: str = Field(description="Database name", default="default")
+    table_name: str = Field(description="Table name")
+
+
+@dataclass
+class DatabricksVolumeDeltaTableStager(DatabrickDeltaTablesUploadStager):
+    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
+        # To avoid new line issues when migrating from volumes into delta tables, omit indenting
+        # and always write it as a json file
+        with output_path.with_suffix(".json").open("w") as f:
+            json.dump(data, f)
+
+
+@dataclass
+class DatabricksVolumeDeltaTableUploader(Uploader):
+    connection_config: DatabrickDeltaTablesConnectionConfig
+    upload_config: DatabricksVolumeDeltaTableUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute("SHOW CATALOGS")
+            catalogs = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.catalog not in catalogs:
+                raise ValueError(
+                    "Catalog {} not found in {}".format(
+                        self.upload_config.catalog, ", ".join(catalogs)
+                    )
+                )
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            cursor.execute("SHOW DATABASES")
+            databases = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.database not in databases:
+                raise ValueError(
+                    "Database {} not found in {}".format(
+                        self.upload_config.database, ", ".join(databases)
+                    )
+                )
+            cursor.execute("SHOW TABLES")
+            table_names = [r[1] for r in cursor.fetchall()]
+            if self.upload_config.table_name not in table_names:
+                raise ValueError(
+                    "Table {} not found in {}".format(
+                        self.upload_config.table_name, ", ".join(table_names)
+                    )
+                )
+
+    def get_output_path(self, file_data: FileData, suffix: str = ".json") -> str:
+        filename = Path(file_data.source_identifiers.filename)
+        adjusted_filename = filename if filename.suffix == suffix else f"{filename}{suffix}"
+        return os.path.join(self.upload_config.path, f"{adjusted_filename}")
+
+    @contextmanager
+    def get_cursor(self, **connect_kwargs) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor(**connect_kwargs) as cursor:
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            yield cursor
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with self.get_cursor(staging_allowed_local_path=str(path.parent)) as cursor:
+            catalog_path = self.get_output_path(file_data=file_data)
+            logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+            cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+            logger.debug(
+                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
+            )
+            with path.open() as f:
+                data = json.load(f)
+                columns = data[0].keys()
+                column_str = ", ".join(columns)
+                sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
+                cursor.execute(sql_statment)
+
+
+databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabrickDeltaTablesConnectionConfig,
+    uploader=DatabricksVolumeDeltaTableUploader,
+    uploader_config=DatabricksVolumeDeltaTableUploaderConfig,
+    upload_stager=DatabricksVolumeDeltaTableStager,
+    upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
+)
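
The upload itself is a two-step round trip through a Unity Catalog volume: run() first PUTs the locally staged JSON into the configured volume path, then issues a single INSERT INTO ... SELECT over that staged file to migrate it into the Delta table, deriving the column list from the keys of the first staged record. Roughly, the two statements look like the sketch below, where the catalog, volume, and table names are placeholders for illustration, not values from the package:

local_json = "/tmp/ingest-workdir/example.pdf.json"               # staged output from the stager
catalog_path = "/Volumes/main/default/landing/example.pdf.json"   # upload_config.path + filename
table_name = "elements"
column_str = "id, element_id, text, embeddings"                   # keys of the first staged record

put_statement = f"PUT '{local_json}' INTO '{catalog_path}' OVERWRITE"
insert_statement = (
    f"INSERT INTO `{table_name}` ({column_str}) "
    f"SELECT {column_str} FROM json.`{catalog_path}`"
)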

unstructured_ingest/v2/processes/connectors/neo4j.py
@@ -105,7 +105,7 @@ class Neo4jUploadStager(UploadStager):
         output_filepath.parent.mkdir(parents=True, exist_ok=True)
 
         with open(output_filepath, "w") as file:
-
+            file.write(_GraphData.from_nx(nx_graph).model_dump_json())
 
         return output_filepath
 
@@ -196,7 +196,7 @@ class _GraphData(BaseModel):
 
 
 class _Node(BaseModel):
-    model_config = ConfigDict(
+    model_config = ConfigDict()
 
     id_: str = Field(default_factory=lambda: str(uuid.uuid4()))
     labels: list[Label] = Field(default_factory=list)
@@ -207,20 +207,20 @@ class _Node(BaseModel):
 
 
 class _Edge(BaseModel):
-    model_config = ConfigDict(
+    model_config = ConfigDict()
 
     source_id: str
     destination_id: str
     relationship: Relationship
 
 
-class Label(
+class Label(Enum):
     UNSTRUCTURED_ELEMENT = "UnstructuredElement"
     CHUNK = "Chunk"
     DOCUMENT = "Document"
 
 
-class Relationship(
+class Relationship(Enum):
     PART_OF_DOCUMENT = "PART_OF_DOCUMENT"
     PART_OF_CHUNK = "PART_OF_CHUNK"
     NEXT_CHUNK = "NEXT_CHUNK"
@@ -263,14 +263,14 @@ class Neo4jUploader(Uploader):
     async def _create_uniqueness_constraints(self, client: AsyncDriver) -> None:
         for label in Label:
             logger.info(
-                f"Adding id uniqueness constraint for nodes labeled '{label}'"
+                f"Adding id uniqueness constraint for nodes labeled '{label.value}'"
                 " if it does not already exist."
             )
-            constraint_name = f"{label.lower()}_id"
+            constraint_name = f"{label.value.lower()}_id"
            await client.execute_query(
                f"""
                CREATE CONSTRAINT {constraint_name} IF NOT EXISTS
-                FOR (n: {label}) REQUIRE n.id IS UNIQUE
+                FOR (n: {label.value}) REQUIRE n.id IS UNIQUE
                """
            )
 
@@ -278,8 +278,8 @@ class Neo4jUploader(Uploader):
         logger.info(f"Deleting old data for the record '{file_data.identifier}' (if present).")
         _, summary, _ = await client.execute_query(
             f"""
-            MATCH (n: {Label.DOCUMENT} {{id: $identifier}})
-            MATCH (n)--(m: {Label.CHUNK}|{Label.UNSTRUCTURED_ELEMENT})
+            MATCH (n: {Label.DOCUMENT.value} {{id: $identifier}})
+            MATCH (n)--(m: {Label.CHUNK.value}|{Label.UNSTRUCTURED_ELEMENT.value})
             DETACH DELETE m""",
             identifier=file_data.identifier,
         )
@@ -349,7 +349,7 @@ class Neo4jUploader(Uploader):
 
     @staticmethod
     def _create_nodes_query(nodes: list[_Node], labels: tuple[Label, ...]) -> tuple[str, dict]:
-        labels_string = ", ".join(labels)
+        labels_string = ", ".join([label.value for label in labels])
         logger.info(f"Preparing MERGE query for {len(nodes)} nodes labeled '{labels_string}'.")
         query_string = f"""
             UNWIND $nodes AS node
@@ -366,7 +366,7 @@ class Neo4jUploader(Uploader):
             UNWIND $edges AS edge
             MATCH (u {{id: edge.source}})
             MATCH (v {{id: edge.destination}})
-            MERGE (u)-[:{relationship}]->(v)
+            MERGE (u)-[:{relationship.value}]->(v)
             """
         parameters = {
             "edges": [
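
The pattern across these hunks is the same: every place a Label or Relationship member reaches Cypher text now goes through .value. With a plain Enum, interpolating the member itself produces its qualified name rather than the underlying string, and str.join rejects non-string members outright. A small self-contained illustration, assuming Label is now a plain Enum as the added .value calls suggest:

from enum import Enum


class Label(Enum):
    CHUNK = "Chunk"


print(f"{Label.CHUNK}")        # -> "Label.CHUNK", not a usable Cypher label
print(f"{Label.CHUNK.value}")  # -> "Chunk"

", ".join([label.value for label in Label])  # works
# ", ".join(list(Label))                     # raises TypeError: expected str instance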

unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -5,12 +5,10 @@ from typing import TYPE_CHECKING, Any, Optional
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import (
-    flatten_dict,
-    generator_batching_wbytes,
-)
+from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.errors import UserError
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -63,6 +61,7 @@ class PineconeConnectionConfig(ConnectionConfig):
         pc = self.get_client()
 
         index = pc.Index(name=self.index_name, **index_kwargs)
+
         logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index
 
@@ -182,14 +181,18 @@ class PineconeUploader(Uploader):
         delete_kwargs = {
             "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
         }
+
         if namespace := self.upload_config.namespace:
             delete_kwargs["namespace"] = namespace
+        try:
+            index.delete(**delete_kwargs)
+        except UserError as e:
+            logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
 
-        resp = index.delete(**delete_kwargs)
         logger.debug(
             f"deleted any content with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "
-            f"from pinecone index: {resp}"
+            f"from pinecone index: {delete_kwargs}"
         )
 
     def serverless_delete_by_record_id(self, file_data: FileData) -> None:
@@ -203,15 +206,19 @@
         deleted_ids = 0
         if namespace := self.upload_config.namespace:
             list_kwargs["namespace"] = namespace
+
         for ids in index.list(**list_kwargs):
             deleted_ids += len(ids)
             delete_kwargs = {"ids": ids}
+
             if namespace := self.upload_config.namespace:
-
-
-
-
-
+                delete_kwargs["namespace"] = namespace
+
+            try:
+                index.delete(**delete_kwargs)
+            except UserError as e:
+                logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
+
         logger.info(
             f"deleted {deleted_ids} records with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "

unstructured_ingest/v2/processes/connectors/sql/__init__.py
@@ -5,6 +5,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )
 
+from .databricks_delta_tables import CONNECTOR_TYPE as DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE
+from .databricks_delta_tables import databricks_delta_tables_destination_entry
 from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
 from .postgres import postgres_destination_entry, postgres_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
@@ -25,3 +27,7 @@ add_destination_entry(destination_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake
 add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
+add_destination_entry(
+    destination_type=DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE,
+    entry=databricks_delta_tables_destination_entry,
+)