unstructured-ingest 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (47)
  1. test/integration/connectors/databricks/test_volumes_native.py +10 -6
  2. test/integration/connectors/discord/test_discord.py +4 -4
  3. test/integration/connectors/duckdb/test_duckdb.py +3 -2
  4. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  5. test/integration/connectors/elasticsearch/test_elasticsearch.py +8 -7
  6. test/integration/connectors/elasticsearch/test_opensearch.py +8 -7
  7. test/integration/connectors/sql/test_databricks_delta_tables.py +142 -0
  8. test/integration/connectors/sql/test_postgres.py +9 -3
  9. test/integration/connectors/sql/test_singlestore.py +9 -3
  10. test/integration/connectors/sql/test_snowflake.py +9 -3
  11. test/integration/connectors/sql/test_sqlite.py +9 -3
  12. test/integration/connectors/test_astradb.py +25 -9
  13. test/integration/connectors/test_azure_ai_search.py +3 -4
  14. test/integration/connectors/test_chroma.py +4 -6
  15. test/integration/connectors/test_confluence.py +3 -5
  16. test/integration/connectors/test_delta_table.py +4 -6
  17. test/integration/connectors/test_lancedb.py +3 -3
  18. test/integration/connectors/test_milvus.py +10 -5
  19. test/integration/connectors/test_mongodb.py +9 -9
  20. test/integration/connectors/test_neo4j.py +16 -8
  21. test/integration/connectors/test_notion.py +7 -0
  22. test/integration/connectors/test_onedrive.py +2 -4
  23. test/integration/connectors/test_pinecone.py +73 -8
  24. test/integration/connectors/test_qdrant.py +5 -4
  25. test/integration/connectors/test_redis.py +3 -3
  26. test/integration/connectors/test_s3.py +7 -6
  27. test/integration/connectors/test_vectara.py +2 -2
  28. test/integration/connectors/utils/constants.py +6 -0
  29. test/integration/connectors/utils/docker.py +2 -2
  30. test/integration/connectors/weaviate/test_cloud.py +5 -0
  31. test/integration/connectors/weaviate/test_local.py +2 -2
  32. unstructured_ingest/__version__.py +1 -1
  33. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  34. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +6 -0
  35. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +6 -3
  36. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +106 -0
  37. unstructured_ingest/v2/processes/connectors/neo4j.py +12 -12
  38. unstructured_ingest/v2/processes/connectors/pinecone.py +18 -11
  39. unstructured_ingest/v2/processes/connectors/sql/__init__.py +6 -0
  40. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +213 -0
  41. unstructured_ingest/v2/processes/connectors/sql/sql.py +26 -9
  42. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/METADATA +20 -18
  43. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/RECORD +47 -44
  44. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/LICENSE.md +0 -0
  45. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/WHEEL +0 -0
  46. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/entry_points.txt +0 -0
  47. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/top_level.txt +0 -0

test/integration/connectors/test_s3.py
@@ -6,6 +6,7 @@ from pathlib import Path
 import pytest
 
 from test.integration.connectors.utils.constants import (
+    BLOB_STORAGE_TAG,
     DESTINATION_TAG,
     SOURCE_TAG,
     env_setup_path,
@@ -47,7 +48,7 @@ def anon_connection_config() -> S3ConnectionConfig:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 async def test_s3_source(anon_connection_config: S3ConnectionConfig):
     indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/")
     with tempfile.TemporaryDirectory() as tempdir:
@@ -70,7 +71,7 @@ async def test_s3_source(anon_connection_config: S3ConnectionConfig):
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig):
     indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/special-characters/")
     with tempfile.TemporaryDirectory() as tempdir:
@@ -92,7 +93,7 @@ async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig
     )
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
     indexer_config = S3IndexerConfig(remote_url="s3://utic-ingest-test-fixtures/destination/")
     indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
@@ -100,7 +101,7 @@ def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
         indexer.precheck()
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 def test_s3_source_no_bucket(anon_connection_config: S3ConnectionConfig):
     indexer_config = S3IndexerConfig(remote_url="s3://fake-bucket")
     indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
@@ -109,7 +110,7 @@ def test_s3_source_no_bucket(anon_connection_config: S3ConnectionConfig):
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio")
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio", BLOB_STORAGE_TAG)
 async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
     anon_connection_config.endpoint_url = "http://localhost:9000"
     indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/")
@@ -149,7 +150,7 @@ def get_aws_credentials() -> dict:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
 @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
 async def test_s3_destination(upload_file: Path):
     aws_credentials = get_aws_credentials()

test/integration/connectors/test_vectara.py
@@ -8,7 +8,7 @@ from uuid import uuid4
 import pytest
 import requests
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.logger import logger
@@ -211,7 +211,7 @@ def corpora_util() -> Generator[str, None, None]:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara")
+@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
 async def test_vectara_destination(
     upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10

test/integration/connectors/utils/constants.py
@@ -2,6 +2,12 @@ from pathlib import Path
 
 SOURCE_TAG = "source"
 DESTINATION_TAG = "destination"
+BLOB_STORAGE_TAG = "blob_storage"
+SQL_TAG = "sql"
+NOSQL_TAG = "nosql"
+VECTOR_DB_TAG = "vector_db"
+GRAPH_DB_TAG = "graph_db"
+UNCATEGORIZED_TAG = "uncategorized"
 
 env_setup_path = Path(__file__).parents[1] / "env_setup"
 expected_results_path = Path(__file__).parents[1] / "expected_results"

test/integration/connectors/utils/docker.py
@@ -44,7 +44,7 @@ def get_container(
     docker_client: docker.DockerClient,
     image: str,
     ports: dict,
-    name: Optional[str] = "connector_test",
+    name: Optional[str] = None,
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
     healthcheck: Optional[HealthCheck] = None,
@@ -115,7 +115,7 @@ def container_context(
     healthcheck: Optional[HealthCheck] = None,
     healthcheck_retries: int = 30,
     docker_client: Optional[docker.DockerClient] = None,
-    name: Optional[str] = "connector_test",
+    name: Optional[str] = None,
 ):
     docker_client = docker_client or docker.from_env()
     print(f"pulling image {image}")

test/integration/connectors/weaviate/test_cloud.py
@@ -1,12 +1,15 @@
 import pytest
 from pydantic import ValidationError
 
+from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
 from unstructured_ingest.v2.processes.connectors.weaviate.cloud import (
+    CONNECTOR_TYPE,
     CloudWeaviateAccessConfig,
     CloudWeaviateConnectionConfig,
 )
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_weaviate_failing_connection_config():
     with pytest.raises(ValidationError):
         CloudWeaviateConnectionConfig(
@@ -16,6 +19,7 @@ def test_weaviate_failing_connection_config():
         )
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_weaviate_connection_config_happy_path():
     CloudWeaviateConnectionConfig(
         access_config=CloudWeaviateAccessConfig(
@@ -25,6 +29,7 @@ def test_weaviate_connection_config_happy_path():
     )
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_weaviate_connection_config_anonymous():
     CloudWeaviateConnectionConfig(
         access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),

test/integration/connectors/weaviate/test_local.py
@@ -7,7 +7,7 @@ import requests
 import weaviate
 from weaviate.client import WeaviateClient
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
 from test.integration.connectors.utils.docker import container_context
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.weaviate.local import (
@@ -74,7 +74,7 @@ def run_uploader_and_validate(
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.13"  # pragma: no cover
+__version__ = "0.3.15"  # pragma: no cover

unstructured_ingest/v2/interfaces/upload_stager.py
@@ -2,7 +2,7 @@ import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, TypeVar
+from typing import Any, Optional, TypeVar
 
 import ndjson
 from pydantic import BaseModel
@@ -22,10 +22,10 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
 class UploadStager(BaseProcess, ABC):
     upload_stager_config: UploadStagerConfigT
 
-    def write_output(self, output_path: Path, data: list[dict]) -> None:
+    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
         if output_path.suffix == ".json":
             with output_path.open("w") as f:
-                json.dump(data, f, indent=2)
+                json.dump(data, f, indent=indent)
         elif output_path.suffix == ".ndjson":
             with output_path.open("w") as f:
                 ndjson.dump(data, f)
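
The write_output() change above makes the JSON indentation overridable by stager subclasses instead of being hard-coded to 2. A minimal sketch, assuming only the signature shown in this diff (the subclass name is hypothetical, not part of the package), of a stager that uses the new parameter to emit compact, single-line JSON:

from pathlib import Path
from typing import Optional

from unstructured_ingest.v2.interfaces.upload_stager import UploadStager


class CompactJsonStager(UploadStager):
    # Hypothetical subclass: forward indent=None so json.dump writes .json
    # output on a single line, which some downstream loaders handle better.
    def write_output(
        self, output_path: Path, data: list[dict], indent: Optional[int] = None
    ) -> None:
        super().write_output(output_path=output_path, data=data, indent=indent)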

unstructured_ingest/v2/processes/connectors/databricks/__init__.py
@@ -25,6 +25,8 @@ from .volumes_native import (
     databricks_native_volumes_destination_entry,
     databricks_native_volumes_source_entry,
 )
+from .volumes_table import CONNECTOR_TYPE as VOLUMES_TABLE_CONNECTOR_TYPE
+from .volumes_table import databricks_volumes_delta_tables_destination_entry
 
 add_source_entry(source_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_source_entry)
 add_destination_entry(
@@ -50,3 +52,7 @@ add_source_entry(
 add_destination_entry(
     destination_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_destination_entry
 )
+add_destination_entry(
+    destination_type=VOLUMES_TABLE_CONNECTOR_TYPE,
+    entry=databricks_volumes_delta_tables_destination_entry,
+)

unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -187,6 +187,11 @@ class DatabricksVolumesUploader(Uploader, ABC):
     upload_config: DatabricksVolumesUploaderConfig
     connection_config: DatabricksVolumesConnectionConfig
 
+    def get_output_path(self, file_data: FileData) -> str:
+        return os.path.join(
+            self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
+        )
+
     def precheck(self) -> None:
         try:
             assert self.connection_config.get_client().current_user.me().active
@@ -194,9 +199,7 @@ class DatabricksVolumesUploader(Uploader, ABC):
             raise self.connection_config.wrap_error(e=e)
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        output_path = os.path.join(
-            self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
-        )
+        output_path = self.get_output_path(file_data=file_data)
         with open(path, "rb") as elements_file:
             try:
                 self.connection_config.get_client().files.upload(

unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py
@@ -0,0 +1,106 @@
+import json
+import os
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Generator, Optional
+
+from pydantic import Field
+
+from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
+from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
+    DatabrickDeltaTablesConnectionConfig,
+    DatabrickDeltaTablesUploadStager,
+    DatabrickDeltaTablesUploadStagerConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volume_delta_tables"
+
+
+class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
+    database: str = Field(description="Database name", default="default")
+    table_name: str = Field(description="Table name")
+
+
+@dataclass
+class DatabricksVolumeDeltaTableStager(DatabrickDeltaTablesUploadStager):
+    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
+        # To avoid new line issues when migrating from volumes into delta tables, omit indenting
+        # and always write it as a json file
+        with output_path.with_suffix(".json").open("w") as f:
+            json.dump(data, f)
+
+
+@dataclass
+class DatabricksVolumeDeltaTableUploader(Uploader):
+    connection_config: DatabrickDeltaTablesConnectionConfig
+    upload_config: DatabricksVolumeDeltaTableUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute("SHOW CATALOGS")
+            catalogs = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.catalog not in catalogs:
+                raise ValueError(
+                    "Catalog {} not found in {}".format(
+                        self.upload_config.catalog, ", ".join(catalogs)
+                    )
+                )
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            cursor.execute("SHOW DATABASES")
+            databases = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.database not in databases:
+                raise ValueError(
+                    "Database {} not found in {}".format(
+                        self.upload_config.database, ", ".join(databases)
+                    )
+                )
+            cursor.execute("SHOW TABLES")
+            table_names = [r[1] for r in cursor.fetchall()]
+            if self.upload_config.table_name not in table_names:
+                raise ValueError(
+                    "Table {} not found in {}".format(
+                        self.upload_config.table_name, ", ".join(table_names)
+                    )
+                )
+
+    def get_output_path(self, file_data: FileData, suffix: str = ".json") -> str:
+        filename = Path(file_data.source_identifiers.filename)
+        adjusted_filename = filename if filename.suffix == suffix else f"{filename}{suffix}"
+        return os.path.join(self.upload_config.path, f"{adjusted_filename}")
+
+    @contextmanager
+    def get_cursor(self, **connect_kwargs) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor(**connect_kwargs) as cursor:
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            yield cursor
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with self.get_cursor(staging_allowed_local_path=str(path.parent)) as cursor:
+            catalog_path = self.get_output_path(file_data=file_data)
+            logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+            cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+            logger.debug(
+                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
+            )
+            with path.open() as f:
+                data = json.load(f)
+            columns = data[0].keys()
+            column_str = ", ".join(columns)
+            sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
+            cursor.execute(sql_statment)
+
+
+databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabrickDeltaTablesConnectionConfig,
+    uploader=DatabricksVolumeDeltaTableUploader,
+    uploader_config=DatabricksVolumeDeltaTableUploaderConfig,
+    upload_stager=DatabricksVolumeDeltaTableStager,
+    upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
+)
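
For reference, run() in the new uploader stages the local JSON output into the configured volume path and then copies it into the Delta table with two SQL statements. A small illustration (not part of the package; the file, volume, table, and column names below are placeholders) of the statements it builds:

# Placeholders standing in for values run() derives from file_data, the staged
# JSON, and upload_config; only the statement shapes mirror the code above.
columns = ["id", "element_id", "text"]      # keys of the first staged JSON record
column_str = ", ".join(columns)
table_name = "elements"                     # upload_config.table_name
catalog_path = "/Volumes/main/default/ingest/report.pdf.json"  # get_output_path(...)
local_path = "/tmp/stager-output/report.pdf.json"              # staged file on disk

put_statement = f"PUT '{local_path}' INTO '{catalog_path}' OVERWRITE"
insert_statement = (
    f"INSERT INTO `{table_name}` ({column_str}) "
    f"SELECT {column_str} FROM json.`{catalog_path}`"
)
print(put_statement)
print(insert_statement)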

unstructured_ingest/v2/processes/connectors/neo4j.py
@@ -105,7 +105,7 @@ class Neo4jUploadStager(UploadStager):
         output_filepath.parent.mkdir(parents=True, exist_ok=True)
 
         with open(output_filepath, "w") as file:
-            json.dump(_GraphData.from_nx(nx_graph).model_dump(), file, indent=4)
+            file.write(_GraphData.from_nx(nx_graph).model_dump_json())
 
         return output_filepath
 
@@ -196,7 +196,7 @@ class _GraphData(BaseModel):
 
 
 class _Node(BaseModel):
-    model_config = ConfigDict(use_enum_values=True)
+    model_config = ConfigDict()
 
     id_: str = Field(default_factory=lambda: str(uuid.uuid4()))
     labels: list[Label] = Field(default_factory=list)
@@ -207,20 +207,20 @@ class _Node(BaseModel):
 
 
 class _Edge(BaseModel):
-    model_config = ConfigDict(use_enum_values=True)
+    model_config = ConfigDict()
 
     source_id: str
     destination_id: str
     relationship: Relationship
 
 
-class Label(str, Enum):
+class Label(Enum):
     UNSTRUCTURED_ELEMENT = "UnstructuredElement"
     CHUNK = "Chunk"
     DOCUMENT = "Document"
 
 
-class Relationship(str, Enum):
+class Relationship(Enum):
     PART_OF_DOCUMENT = "PART_OF_DOCUMENT"
     PART_OF_CHUNK = "PART_OF_CHUNK"
     NEXT_CHUNK = "NEXT_CHUNK"
@@ -263,14 +263,14 @@ class Neo4jUploader(Uploader):
     async def _create_uniqueness_constraints(self, client: AsyncDriver) -> None:
         for label in Label:
             logger.info(
-                f"Adding id uniqueness constraint for nodes labeled '{label}'"
+                f"Adding id uniqueness constraint for nodes labeled '{label.value}'"
                 " if it does not already exist."
             )
-            constraint_name = f"{label.lower()}_id"
+            constraint_name = f"{label.value.lower()}_id"
             await client.execute_query(
                 f"""
                 CREATE CONSTRAINT {constraint_name} IF NOT EXISTS
-                FOR (n: {label}) REQUIRE n.id IS UNIQUE
+                FOR (n: {label.value}) REQUIRE n.id IS UNIQUE
                 """
             )
 
@@ -278,8 +278,8 @@ class Neo4jUploader(Uploader):
         logger.info(f"Deleting old data for the record '{file_data.identifier}' (if present).")
         _, summary, _ = await client.execute_query(
             f"""
-            MATCH (n: {Label.DOCUMENT} {{id: $identifier}})
-            MATCH (n)--(m: {Label.CHUNK}|{Label.UNSTRUCTURED_ELEMENT})
+            MATCH (n: {Label.DOCUMENT.value} {{id: $identifier}})
+            MATCH (n)--(m: {Label.CHUNK.value}|{Label.UNSTRUCTURED_ELEMENT.value})
             DETACH DELETE m""",
             identifier=file_data.identifier,
         )
@@ -349,7 +349,7 @@ class Neo4jUploader(Uploader):
 
     @staticmethod
     def _create_nodes_query(nodes: list[_Node], labels: tuple[Label, ...]) -> tuple[str, dict]:
-        labels_string = ", ".join(labels)
+        labels_string = ", ".join([label.value for label in labels])
         logger.info(f"Preparing MERGE query for {len(nodes)} nodes labeled '{labels_string}'.")
         query_string = f"""
             UNWIND $nodes AS node
@@ -366,7 +366,7 @@ class Neo4jUploader(Uploader):
             UNWIND $edges AS edge
             MATCH (u {{id: edge.source}})
             MATCH (v {{id: edge.destination}})
-            MERGE (u)-[:{relationship}]->(v)
+            MERGE (u)-[:{relationship.value}]->(v)
             """
         parameters = {
             "edges": [

unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -5,12 +5,10 @@ from typing import TYPE_CHECKING, Any, Optional
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import (
-    flatten_dict,
-    generator_batching_wbytes,
-)
+from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.errors import UserError
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -63,6 +61,7 @@ class PineconeConnectionConfig(ConnectionConfig):
         pc = self.get_client()
 
         index = pc.Index(name=self.index_name, **index_kwargs)
+
         logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index
 
@@ -182,14 +181,18 @@ class PineconeUploader(Uploader):
         delete_kwargs = {
             "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
         }
+
         if namespace := self.upload_config.namespace:
             delete_kwargs["namespace"] = namespace
+        try:
+            index.delete(**delete_kwargs)
+        except UserError as e:
+            logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
 
-        resp = index.delete(**delete_kwargs)
         logger.debug(
             f"deleted any content with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "
-            f"from pinecone index: {resp}"
+            f"from pinecone index: {delete_kwargs}"
         )
 
     def serverless_delete_by_record_id(self, file_data: FileData) -> None:
@@ -203,15 +206,19 @@ class PineconeUploader(Uploader):
         deleted_ids = 0
         if namespace := self.upload_config.namespace:
             list_kwargs["namespace"] = namespace
+
         for ids in index.list(**list_kwargs):
             deleted_ids += len(ids)
             delete_kwargs = {"ids": ids}
+
             if namespace := self.upload_config.namespace:
-                delete_resp = delete_kwargs["namespace"] = namespace
-                # delete_resp should be an empty dict if there were no errors
-                if delete_resp:
-                    logger.error(f"failed to delete batch of ids: {delete_resp}")
-            index.delete(**delete_kwargs)
+                delete_kwargs["namespace"] = namespace
+
+            try:
+                index.delete(**delete_kwargs)
+            except UserError as e:
+                logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
+
         logger.info(
             f"deleted {deleted_ids} records with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "

unstructured_ingest/v2/processes/connectors/sql/__init__.py
@@ -5,6 +5,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )
 
+from .databricks_delta_tables import CONNECTOR_TYPE as DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE
+from .databricks_delta_tables import databricks_delta_tables_destination_entry
 from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
 from .postgres import postgres_destination_entry, postgres_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
@@ -25,3 +27,7 @@ add_destination_entry(destination_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake
 add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
+add_destination_entry(
+    destination_type=DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE,
+    entry=databricks_delta_tables_destination_entry,
+)