unstructured-ingest 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (51)
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +50 -3
  10. test/integration/connectors/test_delta_table.py +46 -0
  11. test/integration/connectors/test_kafka.py +40 -6
  12. test/integration/connectors/test_lancedb.py +209 -0
  13. test/integration/connectors/test_milvus.py +141 -0
  14. test/integration/connectors/test_pinecone.py +53 -1
  15. test/integration/connectors/utils/docker.py +81 -15
  16. test/integration/connectors/utils/validation.py +10 -0
  17. test/integration/connectors/weaviate/__init__.py +0 -0
  18. test/integration/connectors/weaviate/conftest.py +15 -0
  19. test/integration/connectors/weaviate/test_local.py +131 -0
  20. unstructured_ingest/__version__.py +1 -1
  21. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  22. unstructured_ingest/utils/data_prep.py +9 -1
  23. unstructured_ingest/v2/processes/connectors/__init__.py +3 -16
  24. unstructured_ingest/v2/processes/connectors/astradb.py +2 -2
  25. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +4 -0
  26. unstructured_ingest/v2/processes/connectors/delta_table.py +20 -4
  27. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  28. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +92 -46
  29. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  30. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +6 -0
  31. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  32. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  33. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  34. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  35. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  36. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  37. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  38. unstructured_ingest/v2/processes/connectors/pinecone.py +24 -7
  39. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  40. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  41. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  42. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  43. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  44. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +289 -0
  45. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/METADATA +15 -15
  46. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/RECORD +50 -30
  47. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  48. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/LICENSE.md +0 -0
  49. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/WHEEL +0 -0
  50. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/entry_points.txt +0 -0
  51. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/top_level.txt +0 -0

test/integration/connectors/test_pinecone.py

@@ -1,7 +1,9 @@
  import json
  import os
+ import re
  import time
  from pathlib import Path
+ from typing import Generator
  from uuid import uuid4

  import pytest
@@ -12,6 +14,7 @@ from test.integration.connectors.utils.constants import (
  DESTINATION_TAG,
  )
  from test.integration.utils import requires_env
+ from unstructured_ingest.error import DestinationConnectionError
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.processes.connectors.pinecone import (
@@ -24,6 +27,12 @@ from unstructured_ingest.v2.processes.connectors.pinecone import (
  PineconeUploadStagerConfig,
  )

+ METADATA_BYTES_LIMIT = (
+ 40960 # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
+ )
+ VECTOR_DIMENSION = 384
+ SPEC = {"serverless": {"cloud": "aws", "region": "us-east-1"}}
+ ALLOWED_METADATA_FIELD = "text"
  API_KEY = "PINECONE_API_KEY"


@@ -62,7 +71,7 @@ def wait_for_ready(client: Pinecone, index_name: str, timeout=60, interval=1) ->


  @pytest.fixture
- def pinecone_index() -> str:
+ def pinecone_index() -> Generator[str, None, None]:
  pinecone = Pinecone(api_key=get_api_key())
  random_id = str(uuid4()).split("-")[0]
  index_name = f"ingest-test-{random_id}"
@@ -159,3 +168,46 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
  validate_pinecone_index(
  index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
  )
+
+
+ @requires_env(API_KEY)
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
+ stager = PineconeUploadStager()
+ uploader = PineconeUploader(
+ connection_config=PineconeConnectionConfig(
+ access_config=PineconeAccessConfig(api_key=get_api_key()),
+ index_name=pinecone_index,
+ ),
+ upload_config=PineconeUploaderConfig(),
+ )
+ large_metadata_upload_file = tmp_path / "mock-upload-file.pdf.json"
+ large_metadata = {ALLOWED_METADATA_FIELD: "0" * 2 * METADATA_BYTES_LIMIT}
+
+ with open(upload_file) as file:
+ elements = json.load(file)
+
+ with open(large_metadata_upload_file, "w") as file:
+ mock_element = elements[0]
+ mock_element["metadata"] = large_metadata
+ json.dump([mock_element], file)
+
+ file_data = FileData(
+ source_identifiers=SourceIdentifiers(
+ fullpath=large_metadata_upload_file.name, filename=large_metadata_upload_file.name
+ ),
+ connector_type=CONNECTOR_TYPE,
+ identifier="mock-file-data",
+ )
+ staged_file = stager.run(
+ file_data, large_metadata_upload_file, tmp_path, large_metadata_upload_file.name
+ )
+ try:
+ uploader.run(staged_file, file_data)
+ except DestinationConnectionError as e:
+ error_line = r"Metadata size is \d+ bytes, which exceeds the limit of \d+ bytes per vector"
+ if re.search(re.compile(error_line), str(e)) is None:
+ raise e
+ raise pytest.fail("Upload request failed due to metadata exceeding limits.")
+
+ validate_pinecone_index(pinecone_index, 1, interval=5)
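
Note: the new test_large_metadata test expects the uploader to surface a DestinationConnectionError whose message matches "Metadata size is N bytes, which exceeds the limit of M bytes per vector". Purely for context, here is a minimal sketch of the kind of size guard that message implies, assuming the metadata size is measured as serialized JSON; check_metadata_size is a hypothetical helper, not part of the connector's API.

    import json

    METADATA_BYTES_LIMIT = 40960  # Pinecone's documented 40KB per-vector hard limit

    def check_metadata_size(metadata: dict) -> None:
        # Hypothetical guard: measure the metadata as UTF-8 encoded JSON and
        # refuse anything over the per-vector limit.
        size = len(json.dumps(metadata).encode("utf-8"))
        if size > METADATA_BYTES_LIMIT:
            raise ValueError(
                f"Metadata size is {size} bytes, which exceeds the limit of "
                f"{METADATA_BYTES_LIMIT} bytes per vector"
            )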

test/integration/connectors/utils/docker.py

@@ -1,9 +1,43 @@
  import time
  from contextlib import contextmanager
- from typing import Optional
+ from typing import Optional, Union

  import docker
  from docker.models.containers import Container
+ from pydantic import BaseModel, Field, field_serializer
+
+
+ class HealthCheck(BaseModel):
+ test: Union[str, list[str]]
+ interval: int = Field(
+ gt=0, default=30, description="The time to wait between checks in seconds."
+ )
+ timeout: int = Field(
+ gt=0, default=30, description="The time to wait before considering the check to have hung."
+ )
+ retries: int = Field(
+ gt=0,
+ default=3,
+ description="The number of consecutive failures needed "
+ "to consider a container as unhealthy.",
+ )
+ start_period: int = Field(
+ gt=0,
+ default=0,
+ description="Start period for the container to initialize before starting health-retries countdown in seconds.", # noqa: E501
+ )
+
+ @field_serializer("interval")
+ def serialize_interval(self, interval: int) -> int:
+ return int(interval * 10e8)
+
+ @field_serializer("timeout")
+ def serialize_timeout(self, timeout: int) -> int:
+ return int(timeout * 10e8)
+
+ @field_serializer("start_period")
+ def serialize_start_period(self, start_period: int) -> int:
+ return int(start_period * 10e8)


  def get_container(
@@ -12,7 +46,7 @@ def get_container(
  ports: dict,
  environment: Optional[dict] = None,
  volumes: Optional[dict] = None,
- healthcheck: Optional[dict] = None,
+ healthcheck: Optional[HealthCheck] = None,
  ) -> Container:
  run_kwargs = {
  "image": image,
@@ -24,25 +58,49 @@ def get_container(
  if volumes:
  run_kwargs["volumes"] = volumes
  if healthcheck:
- run_kwargs["healthcheck"] = healthcheck
+ run_kwargs["healthcheck"] = healthcheck.model_dump()
  container: Container = docker_client.containers.run(**run_kwargs)
  return container


- def has_healthcheck(container: Container) -> bool:
- return container.attrs.get("Config", {}).get("Healthcheck", None) is not None
+ def get_healthcheck(container: Container) -> Optional[HealthCheck]:
+ healthcheck_config = container.attrs.get("Config", {}).get("Healthcheck", None)
+ if not healthcheck_config:
+ return None
+ healthcheck_data = {
+ "test": healthcheck_config["Test"],
+ }
+ if interval := healthcheck_config.get("Interval"):
+ healthcheck_data["interval"] = interval / 10e8
+ if start_period := healthcheck_config.get("StartPeriod"):
+ healthcheck_data["start_period"] = start_period / 10e8
+ if retries := healthcheck_config.get("Retries"):
+ healthcheck_data["retries"] = retries
+ return HealthCheck.model_validate(healthcheck_data)


- def healthcheck_wait(container: Container, timeout: int = 10) -> None:
+ def healthcheck_wait(
+ container: Container, retries: int = 30, interval: int = 1, start_period: Optional[int] = None
+ ) -> None:
+ if start_period:
+ time.sleep(start_period)
  health = container.health
- start = time.time()
- while health != "healthy" and time.time() - start < timeout:
- time.sleep(1)
+ tries = 0
+ while health != "healthy" and tries < retries:
+ tries += 1
+ logs = container.attrs.get("State", {}).get("Health", {}).get("Log")
+ latest_log = logs[-1] if logs else None
+ print(
+ f"attempt {tries} - waiting for docker container "
+ f"to be healthy: {health} latest log: {latest_log}"
+ )
+ time.sleep(interval)
  container.reload()
  health = container.health
  if health != "healthy":
- health_dict = container.attrs.get("State", {}).get("Health", {})
- raise TimeoutError(f"Docker container never came up healthy: {health_dict}")
+ logs = container.attrs.get("State", {}).get("Health", {}).get("Log")
+ latest_log = logs[-1] if logs else None
+ raise TimeoutError(f"Docker container never came up healthy: {latest_log}")


  @contextmanager
@@ -51,11 +109,13 @@ def container_context(
  ports: dict,
  environment: Optional[dict] = None,
  volumes: Optional[dict] = None,
- healthcheck: Optional[dict] = None,
- healthcheck_timeout: int = 10,
+ healthcheck: Optional[HealthCheck] = None,
+ healthcheck_retries: int = 30,
  docker_client: Optional[docker.DockerClient] = None,
  ):
  docker_client = docker_client or docker.from_env()
+ print(f"pulling image {image}")
+ docker_client.images.pull(image)
  container: Optional[Container] = None
  try:
  container = get_container(
@@ -66,8 +126,14 @@
  volumes=volumes,
  healthcheck=healthcheck,
  )
- if has_healthcheck(container):
- healthcheck_wait(container=container, timeout=healthcheck_timeout)
+ if healthcheck_data := get_healthcheck(container):
+ # Mirror whatever healthcheck config set on container
+ healthcheck_wait(
+ container=container,
+ retries=healthcheck_retries,
+ start_period=healthcheck_data.start_period,
+ interval=healthcheck_data.interval,
+ )
  yield container
  except AssertionError as e:
  if container:
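
Note: HealthCheck stores its interval, timeout, and start_period in seconds and serializes them to the nanosecond integers the Docker Engine API expects (int(value * 10e8), i.e. seconds times 1e9), and container_context now waits according to whatever healthcheck the container reports. A minimal usage sketch, assuming these test utilities; the image tag and healthcheck command below are illustrative, not taken from the test suite.

    from test.integration.connectors.utils.docker import HealthCheck, container_context

    healthcheck = HealthCheck(
        test=["CMD-SHELL", "curl -f http://localhost:9200 || exit 1"],  # assumed check command
        interval=5,   # seconds; serialized to nanoseconds for the Docker API
        timeout=10,
        retries=5,
    )

    with container_context(
        image="docker.elastic.co/elasticsearch/elasticsearch:8.15.0",  # assumed image
        ports={9200: 9200},
        healthcheck=healthcheck,
        healthcheck_retries=30,
    ) as container:
        # The context manager has already waited for the container's healthcheck
        # (or raised TimeoutError), so the service can be used here.
        print(container.health)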

test/integration/connectors/utils/validation.py

@@ -240,6 +240,10 @@ def update_fixtures(
  # Rewrite the current file data
  if save_filedata:
  file_data_output_path = output_dir / "file_data"
+ print(
+ f"Writing {len(all_file_data)} file data to "
+ f"saved fixture location {file_data_output_path}"
+ )
  file_data_output_path.mkdir(parents=True, exist_ok=True)
  for file_data in all_file_data:
  file_data_path = file_data_output_path / f"{file_data.identifier}.json"
@@ -256,6 +260,10 @@ def update_fixtures(
  # If applicable, save raw downloads
  if save_downloads:
  raw_download_output_path = output_dir / "downloads"
+ print(
+ f"Writing {len(download_files)} downloaded files to "
+ f"saved fixture location {raw_download_output_path}"
+ )
  shutil.copytree(download_dir, raw_download_output_path)


@@ -328,6 +336,7 @@ async def source_connector_validation(
  postdownload_file_data = replace(resp["file_data"])
  all_postdownload_file_data.append(postdownload_file_data)
  if not overwrite_fixtures:
+ print("Running validation")
  run_all_validations(
  configs=configs,
  predownload_file_data=all_predownload_file_data,
@@ -336,6 +345,7 @@ async def source_connector_validation(
  test_output_dir=test_output_dir,
  )
  else:
+ print("Running fixtures update")
  update_fixtures(
  output_dir=test_output_dir,
  download_dir=download_dir,

test/integration/connectors/weaviate/__init__.py
File without changes

test/integration/connectors/weaviate/conftest.py

@@ -0,0 +1,15 @@
+ import json
+ from pathlib import Path
+
+ import pytest
+
+
+ @pytest.fixture
+ def collections_schema_config() -> dict:
+ int_test_dir = Path(__file__).parent
+ assets_dir = int_test_dir / "assets"
+ config_file = assets_dir / "elements.json"
+ assert config_file.exists()
+ assert config_file.is_file()
+ with config_file.open() as config_data:
+ return json.load(config_data)

test/integration/connectors/weaviate/test_local.py

@@ -0,0 +1,131 @@
+ import json
+ import time
+ from pathlib import Path
+
+ import pytest
+ import requests
+ import weaviate
+ from weaviate.client import WeaviateClient
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG
+ from test.integration.connectors.utils.docker import container_context
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.weaviate.local import (
+ CONNECTOR_TYPE,
+ LocalWeaviateConnectionConfig,
+ LocalWeaviateUploader,
+ LocalWeaviateUploaderConfig,
+ LocalWeaviateUploadStager,
+ )
+
+ COLLECTION_NAME = "elements"
+
+
+ def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
+ start_time = time.time()
+ while time.time() - start_time < timeout:
+ try:
+ requests.get("http://localhost:8080/v1/.well-known/read")
+ return
+ except Exception as e:
+ print(f"Failed to validate container healthy, sleeping for {interval} seconds: {e}")
+ time.sleep(interval)
+ raise TimeoutError("Docker container never came up healthy")
+
+
+ @pytest.fixture
+ def collection(collections_schema_config: dict) -> str:
+ with container_context(
+ image="semitechnologies/weaviate:1.27.3",
+ ports={8080: 8080, 50051: 50051},
+ ):
+ wait_for_container()
+ with weaviate.connect_to_local() as weaviate_client:
+ weaviate_client.collections.create_from_dict(config=collections_schema_config)
+ yield COLLECTION_NAME
+
+
+ def get_count(client: WeaviateClient) -> int:
+ collection = client.collections.get(COLLECTION_NAME)
+ resp = collection.aggregate.over_all(total_count=True)
+ return resp.total_count
+
+
+ def validate_count(expected_count: int, retries: int = 10, interval: int = 1) -> None:
+ with weaviate.connect_to_local() as weaviate_client:
+ current_count = get_count(client=weaviate_client)
+ retry_count = 0
+ while current_count != expected_count and retry_count < retries:
+ retry_count += 1
+ time.sleep(interval)
+ current_count = get_count(client=weaviate_client)
+ assert current_count == expected_count, (
+ f"Expected count ({expected_count}) doesn't match how "
+ f"much came back from collection: {current_count}"
+ )
+
+
+ def run_uploader_and_validate(
+ uploader: LocalWeaviateUploader, path: Path, file_data: FileData, expected_count: int
+ ):
+ uploader.precheck()
+ uploader.run(path=path, file_data=file_data)
+ validate_count(expected_count=expected_count)
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
+ file_data = FileData(
+ source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+ connector_type=CONNECTOR_TYPE,
+ identifier="mock file data",
+ )
+ stager = LocalWeaviateUploadStager()
+
+ staged_filepath = stager.run(
+ elements_filepath=upload_file,
+ file_data=file_data,
+ output_dir=tmp_path,
+ output_filename=upload_file.name,
+ )
+ dynamic_uploader = LocalWeaviateUploader(
+ upload_config=LocalWeaviateUploaderConfig(
+ collection=COLLECTION_NAME,
+ ),
+ connection_config=LocalWeaviateConnectionConfig(),
+ )
+ fixed_size_uploader = LocalWeaviateUploader(
+ upload_config=LocalWeaviateUploaderConfig(
+ collection=COLLECTION_NAME, batch_size=10, dynamic_batch=False
+ ),
+ connection_config=LocalWeaviateConnectionConfig(),
+ )
+ rate_limited_uploader = LocalWeaviateUploader(
+ upload_config=LocalWeaviateUploaderConfig(
+ collection=COLLECTION_NAME, requests_per_minute=50, dynamic_batch=False
+ ),
+ connection_config=LocalWeaviateConnectionConfig(),
+ )
+ with staged_filepath.open() as f:
+ staged_elements = json.load(f)
+ expected_count = len(staged_elements)
+
+ run_uploader_and_validate(
+ uploader=dynamic_uploader,
+ path=staged_filepath,
+ file_data=file_data,
+ expected_count=expected_count,
+ )
+ run_uploader_and_validate(
+ uploader=fixed_size_uploader,
+ path=staged_filepath,
+ file_data=file_data,
+ expected_count=expected_count,
+ )
+ run_uploader_and_validate(
+ uploader=rate_limited_uploader,
+ path=staged_filepath,
+ file_data=file_data,
+ expected_count=expected_count,
+ )

unstructured_ingest/__version__.py

@@ -1 +1 @@
- __version__ = "0.3.0" # pragma: no cover
+ __version__ = "0.3.1" # pragma: no cover

unstructured_ingest/pipeline/reformat/embedding.py

@@ -61,4 +61,4 @@ class Embedder(ReformatNode):
  return None

  def get_path(self) -> Path:
- return (Path(self.pipeline_context.work_dir) / "embedded").resolve()
+ return (Path(self.pipeline_context.work_dir) / "embedded.py").resolve()

unstructured_ingest/utils/data_prep.py

@@ -1,7 +1,9 @@
  import itertools
  import json
  from datetime import datetime
- from typing import Any, Iterable, Optional, Sequence, TypeVar, cast
+ from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
+
+ import pandas as pd

  DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")

@@ -9,6 +11,12 @@ T = TypeVar("T")
  IterableT = Iterable[T]


+ def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
+ num_chunks = len(df) // chunk_size + 1
+ for i in range(num_chunks):
+ yield df[i * chunk_size : (i + 1) * chunk_size]
+
+
  def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
  """A helper function to break an iterable into batches of size batch_size."""
  it = iter(iterable)
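
Note: the new split_dataframe helper yields successive row slices of a DataFrame. A short usage sketch, with illustrative data:

    import pandas as pd

    from unstructured_ingest.utils.data_prep import split_dataframe

    df = pd.DataFrame({"id": range(250)})  # illustrative data
    for chunk in split_dataframe(df, chunk_size=100):
        # Yields slices of 100, 100, and 50 rows; when len(df) is an exact
        # multiple of chunk_size, the final yielded slice is empty.
        print(len(chunk))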

unstructured_ingest/v2/processes/connectors/__init__.py

@@ -1,10 +1,13 @@
  from __future__ import annotations

  import unstructured_ingest.v2.processes.connectors.databricks # noqa: F401
+ import unstructured_ingest.v2.processes.connectors.elasticsearch # noqa: F401
  import unstructured_ingest.v2.processes.connectors.fsspec # noqa: F401
  import unstructured_ingest.v2.processes.connectors.kafka # noqa: F401
+ import unstructured_ingest.v2.processes.connectors.lancedb # noqa: F401
  import unstructured_ingest.v2.processes.connectors.qdrant # noqa: F401
  import unstructured_ingest.v2.processes.connectors.sql # noqa: F401
+ import unstructured_ingest.v2.processes.connectors.weaviate # noqa: F401
  from unstructured_ingest.v2.processes.connector_registry import (
  add_destination_entry,
  add_source_entry,
@@ -24,8 +27,6 @@ from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
  from .couchbase import couchbase_destination_entry, couchbase_source_entry
  from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
  from .delta_table import delta_table_destination_entry
- from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
- from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
  from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
  from .gitlab import gitlab_source_entry
  from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -40,8 +41,6 @@ from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
  from .mongodb import mongodb_destination_entry, mongodb_source_entry
  from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
  from .onedrive import onedrive_destination_entry, onedrive_source_entry
- from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
- from .opensearch import opensearch_destination_entry, opensearch_source_entry
  from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
  from .outlook import outlook_source_entry
  from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
@@ -52,8 +51,6 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
  from .sharepoint import sharepoint_source_entry
  from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
  from .slack import slack_source_entry
- from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
- from .weaviate import weaviate_destination_entry

  add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
  add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
@@ -67,10 +64,6 @@ add_destination_entry(
  destination_type=DELTA_TABLE_CONNECTOR_TYPE, entry=delta_table_destination_entry
  )

- add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
- add_destination_entry(
- destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
- )

  add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry)

@@ -80,15 +73,9 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
  add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
  add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)

- add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
- add_destination_entry(
- destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
- )

  add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)

- add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
-
  add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
  add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)


unstructured_ingest/v2/processes/connectors/astradb.py

@@ -170,7 +170,7 @@ class AstraDBIndexer(Indexer):

  def precheck(self) -> None:
  try:
- self.get_collection()
+ self.get_collection().options()
  except Exception as e:
  logger.error(f"Failed to validate connection {e}", exc_info=True)
  raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -345,7 +345,7 @@ class AstraDBUploader(Uploader):
  connection_config=self.connection_config,
  collection_name=self.upload_config.collection_name,
  keyspace=self.upload_config.keyspace,
- )
+ ).options()
  except Exception as e:
  logger.error(f"Failed to validate connection {e}", exc_info=True)
  raise DestinationConnectionError(f"failed to validate connection: {e}")

unstructured_ingest/v2/processes/connectors/azure_ai_search.py

@@ -155,6 +155,10 @@ class AzureAISearchUploadStager(UploadStager):
  self.conform_dict(data=element, file_data=file_data) for element in elements_contents
  ]

+ if Path(output_filename).suffix != ".json":
+ output_filename = f"{output_filename}.json"
+ else:
+ output_filename = f"{Path(output_filename).stem}.json"
  output_path = Path(output_dir) / Path(f"{output_filename}.json")
  output_path.parent.mkdir(parents=True, exist_ok=True)
  with open(output_path, "w") as output_file:

unstructured_ingest/v2/processes/connectors/delta_table.py

@@ -1,7 +1,8 @@
  import json
  import os
+ import traceback
  from dataclasses import dataclass, field
- from multiprocessing import Process
+ from multiprocessing import Process, Queue
  from pathlib import Path
  from typing import Any, Optional
  from urllib.parse import urlparse
@@ -27,6 +28,15 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis
  CONNECTOR_TYPE = "delta_table"


+ def write_deltalake_with_error_handling(queue, **kwargs):
+ from deltalake.writer import write_deltalake
+
+ try:
+ write_deltalake(**kwargs)
+ except Exception:
+ queue.put(traceback.format_exc())
+
+
  class DeltaTableAccessConfig(AccessConfig):
  aws_access_key_id: Optional[str] = Field(default=None, description="AWS Access Key Id")
  aws_secret_access_key: Optional[str] = Field(default=None, description="AWS Secret Access Key")
@@ -157,7 +167,6 @@ class DeltaTableUploader(Uploader):

  @requires_dependencies(["deltalake"], extras="delta-table")
  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
- from deltalake.writer import write_deltalake

  df = self.read_dataframe(path)
  updated_upload_path = os.path.join(
@@ -176,17 +185,24 @@
  "mode": "overwrite",
  "storage_options": storage_options,
  }
+ queue = Queue()
  # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
  # ingest to fail, even though all tasks are completed normally. Putting the writer into a
  # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
  # rust backend to finish
  writer = Process(
- target=write_deltalake,
- kwargs=writer_kwargs,
+ target=write_deltalake_with_error_handling,
+ kwargs={"queue": queue, **writer_kwargs},
  )
  writer.start()
  writer.join()

+ # Check if the queue has any exception message
+ if not queue.empty():
+ error_message = queue.get()
+ logger.error(f"Exception occurred in write_deltalake: {error_message}")
+ raise RuntimeError(f"Error in write_deltalake: {error_message}")
+

  delta_table_destination_entry = DestinationRegistryEntry(
  connection_config=DeltaTableConnectionConfig,
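
Note: the delta_table uploader now runs write_deltalake in a child process (see the SIGABRT note above) and ships any traceback back through a multiprocessing.Queue so the parent can re-raise it. A self-contained sketch of that error-propagation pattern, using only the standard library; do_work and its argument are placeholders, not connector code:

    import traceback
    from multiprocessing import Process, Queue


    def do_work(value: int) -> None:
        # Placeholder for the real work (write_deltalake in the connector).
        if value < 0:
            raise ValueError("value must be non-negative")


    def run_with_error_handling(queue, **kwargs) -> None:
        try:
            do_work(**kwargs)
        except Exception:
            # Exceptions don't cross the process boundary on their own,
            # so ship the formatted traceback back to the parent.
            queue.put(traceback.format_exc())


    if __name__ == "__main__":
        queue = Queue()
        worker = Process(target=run_with_error_handling, kwargs={"queue": queue, "value": -1})
        worker.start()
        worker.join()
        if not queue.empty():
            raise RuntimeError(f"Error in child process: {queue.get()}")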

unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py

@@ -0,0 +1,19 @@
+ from unstructured_ingest.v2.processes.connector_registry import (
+ add_destination_entry,
+ add_source_entry,
+ )
+
+ from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
+ from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+ from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
+ from .opensearch import opensearch_destination_entry, opensearch_source_entry
+
+ add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
+ add_destination_entry(
+ destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
+ )
+
+ add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
+ add_destination_entry(
+ destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
+ )