unstructured-ingest 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
- test/integration/connectors/sql/test_postgres.py +10 -4
- test/integration/connectors/sql/test_singlestore.py +8 -4
- test/integration/connectors/sql/test_snowflake.py +10 -6
- test/integration/connectors/sql/test_sqlite.py +4 -4
- test/integration/connectors/test_astradb.py +50 -3
- test/integration/connectors/test_delta_table.py +46 -0
- test/integration/connectors/test_kafka.py +40 -6
- test/integration/connectors/test_lancedb.py +209 -0
- test/integration/connectors/test_milvus.py +141 -0
- test/integration/connectors/test_pinecone.py +53 -1
- test/integration/connectors/utils/docker.py +81 -15
- test/integration/connectors/utils/validation.py +10 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/pipeline/reformat/embedding.py +1 -1
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -16
- unstructured_ingest/v2/processes/connectors/astradb.py +2 -2
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +4 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +20 -4
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +92 -46
- unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +6 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
- unstructured_ingest/v2/processes/connectors/pinecone.py +24 -7
- unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +289 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/METADATA +15 -15
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/RECORD +50 -30
- unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_pinecone.py

@@ -1,7 +1,9 @@
 import json
 import os
+import re
 import time
 from pathlib import Path
+from typing import Generator
 from uuid import uuid4

 import pytest
@@ -12,6 +14,7 @@ from test.integration.connectors.utils.constants import (
     DESTINATION_TAG,
 )
 from test.integration.utils import requires_env
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connectors.pinecone import (
@@ -24,6 +27,12 @@ from unstructured_ingest.v2.processes.connectors.pinecone import (
     PineconeUploadStagerConfig,
 )

+METADATA_BYTES_LIMIT = (
+    40960  # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
+)
+VECTOR_DIMENSION = 384
+SPEC = {"serverless": {"cloud": "aws", "region": "us-east-1"}}
+ALLOWED_METADATA_FIELD = "text"
 API_KEY = "PINECONE_API_KEY"


@@ -62,7 +71,7 @@ def wait_for_ready(client: Pinecone, index_name: str, timeout=60, interval=1) ->


 @pytest.fixture
-def pinecone_index() -> str:
+def pinecone_index() -> Generator[str, None, None]:
     pinecone = Pinecone(api_key=get_api_key())
     random_id = str(uuid4()).split("-")[0]
     index_name = f"ingest-test-{random_id}"
@@ -159,3 +168,46 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
     validate_pinecone_index(
         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
     )
+
+
+@requires_env(API_KEY)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
+    stager = PineconeUploadStager()
+    uploader = PineconeUploader(
+        connection_config=PineconeConnectionConfig(
+            access_config=PineconeAccessConfig(api_key=get_api_key()),
+            index_name=pinecone_index,
+        ),
+        upload_config=PineconeUploaderConfig(),
+    )
+    large_metadata_upload_file = tmp_path / "mock-upload-file.pdf.json"
+    large_metadata = {ALLOWED_METADATA_FIELD: "0" * 2 * METADATA_BYTES_LIMIT}
+
+    with open(upload_file) as file:
+        elements = json.load(file)
+
+    with open(large_metadata_upload_file, "w") as file:
+        mock_element = elements[0]
+        mock_element["metadata"] = large_metadata
+        json.dump([mock_element], file)
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(
+            fullpath=large_metadata_upload_file.name, filename=large_metadata_upload_file.name
+        ),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+    staged_file = stager.run(
+        file_data, large_metadata_upload_file, tmp_path, large_metadata_upload_file.name
+    )
+    try:
+        uploader.run(staged_file, file_data)
+    except DestinationConnectionError as e:
+        error_line = r"Metadata size is \d+ bytes, which exceeds the limit of \d+ bytes per vector"
+        if re.search(re.compile(error_line), str(e)) is None:
+            raise e
+        raise pytest.fail("Upload request failed due to metadata exceeding limits.")
+
+    validate_pinecone_index(pinecone_index, 1, interval=5)
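The new test above exercises Pinecone's documented 40 KB per-vector metadata limit. As a rough standalone illustration of that limit check only (not the connector's actual implementation; the helper name and error type here are hypothetical):

import json

METADATA_BYTES_LIMIT = 40960  # 40KB, Pinecone's documented per-vector metadata cap


def check_metadata_size(metadata: dict) -> None:
    # Hypothetical helper: measure the JSON-serialized metadata payload and
    # raise when it exceeds the per-vector limit, producing a message in the
    # same shape as the regex the test matches against.
    size = len(json.dumps(metadata).encode("utf-8"))
    if size > METADATA_BYTES_LIMIT:
        raise ValueError(
            f"Metadata size is {size} bytes, which exceeds the limit of "
            f"{METADATA_BYTES_LIMIT} bytes per vector"
        )


try:
    check_metadata_size({"text": "0" * 2 * METADATA_BYTES_LIMIT})
except ValueError as e:
    print(e)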
test/integration/connectors/utils/docker.py

@@ -1,9 +1,43 @@
 import time
 from contextlib import contextmanager
-from typing import Optional
+from typing import Optional, Union

 import docker
 from docker.models.containers import Container
+from pydantic import BaseModel, Field, field_serializer
+
+
+class HealthCheck(BaseModel):
+    test: Union[str, list[str]]
+    interval: int = Field(
+        gt=0, default=30, description="The time to wait between checks in seconds."
+    )
+    timeout: int = Field(
+        gt=0, default=30, description="The time to wait before considering the check to have hung."
+    )
+    retries: int = Field(
+        gt=0,
+        default=3,
+        description="The number of consecutive failures needed "
+        "to consider a container as unhealthy.",
+    )
+    start_period: int = Field(
+        gt=0,
+        default=0,
+        description="Start period for the container to initialize before starting health-retries countdown in seconds.",  # noqa: E501
+    )
+
+    @field_serializer("interval")
+    def serialize_interval(self, interval: int) -> int:
+        return int(interval * 10e8)
+
+    @field_serializer("timeout")
+    def serialize_timeout(self, timeout: int) -> int:
+        return int(timeout * 10e8)
+
+    @field_serializer("start_period")
+    def serialize_start_period(self, start_period: int) -> int:
+        return int(start_period * 10e8)


 def get_container(
@@ -12,7 +46,7 @@ def get_container(
     ports: dict,
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
-    healthcheck: Optional[
+    healthcheck: Optional[HealthCheck] = None,
 ) -> Container:
     run_kwargs = {
         "image": image,
@@ -24,25 +58,49 @@ def get_container(
     if volumes:
         run_kwargs["volumes"] = volumes
     if healthcheck:
-        run_kwargs["healthcheck"] = healthcheck
+        run_kwargs["healthcheck"] = healthcheck.model_dump()
     container: Container = docker_client.containers.run(**run_kwargs)
     return container


-def
-
+def get_healthcheck(container: Container) -> Optional[HealthCheck]:
+    healthcheck_config = container.attrs.get("Config", {}).get("Healthcheck", None)
+    if not healthcheck_config:
+        return None
+    healthcheck_data = {
+        "test": healthcheck_config["Test"],
+    }
+    if interval := healthcheck_config.get("Interval"):
+        healthcheck_data["interval"] = interval / 10e8
+    if start_period := healthcheck_config.get("StartPeriod"):
+        healthcheck_data["start_period"] = start_period / 10e8
+    if retries := healthcheck_config.get("Retries"):
+        healthcheck_data["retries"] = retries
+    return HealthCheck.model_validate(healthcheck_data)


-def healthcheck_wait(
+def healthcheck_wait(
+    container: Container, retries: int = 30, interval: int = 1, start_period: Optional[int] = None
+) -> None:
+    if start_period:
+        time.sleep(start_period)
     health = container.health
-
-    while health != "healthy" and
-
+    tries = 0
+    while health != "healthy" and tries < retries:
+        tries += 1
+        logs = container.attrs.get("State", {}).get("Health", {}).get("Log")
+        latest_log = logs[-1] if logs else None
+        print(
+            f"attempt {tries} - waiting for docker container "
+            f"to be healthy: {health} latest log: {latest_log}"
+        )
+        time.sleep(interval)
         container.reload()
         health = container.health
     if health != "healthy":
-
-
+        logs = container.attrs.get("State", {}).get("Health", {}).get("Log")
+        latest_log = logs[-1] if logs else None
+        raise TimeoutError(f"Docker container never came up healthy: {latest_log}")


 @contextmanager
@@ -51,11 +109,13 @@ def container_context(
     ports: dict,
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
-    healthcheck: Optional[
-
+    healthcheck: Optional[HealthCheck] = None,
+    healthcheck_retries: int = 30,
     docker_client: Optional[docker.DockerClient] = None,
 ):
     docker_client = docker_client or docker.from_env()
+    print(f"pulling image {image}")
+    docker_client.images.pull(image)
     container: Optional[Container] = None
     try:
         container = get_container(
@@ -66,8 +126,14 @@ def container_context(
             volumes=volumes,
             healthcheck=healthcheck,
         )
-        if
-
+        if healthcheck_data := get_healthcheck(container):
+            # Mirror whatever healthcheck config set on container
+            healthcheck_wait(
+                container=container,
+                retries=healthcheck_retries,
+                start_period=healthcheck_data.start_period,
+                interval=healthcheck_data.interval,
+            )
         yield container
     except AssertionError as e:
         if container:
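The HealthCheck model above serializes its durations to nanoseconds because that is the unit the Docker Engine API (and docker-py's healthcheck argument) expects; 10e8 is simply 1e9 written differently. A minimal standalone sketch of the same idea, assuming a local Docker daemon is available; the image and check command are arbitrary choices for illustration:

import time

import docker

client = docker.from_env()
# Durations are passed to docker-py in nanoseconds, mirroring the
# int(seconds * 10e8) serializers in the HealthCheck model above.
container = client.containers.run(
    image="nginx:alpine",  # arbitrary image for the example
    detach=True,
    ports={"80/tcp": 8080},
    healthcheck={
        "test": ["CMD-SHELL", "wget -q -O /dev/null http://localhost || exit 1"],
        "interval": int(2 * 1e9),
        "timeout": int(2 * 1e9),
        "retries": 3,
    },
)
try:
    for _ in range(30):
        container.reload()
        status = container.attrs.get("State", {}).get("Health", {}).get("Status")
        print(f"health status: {status}")
        if status == "healthy":
            break
        time.sleep(1)
finally:
    container.stop()
    container.remove()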
test/integration/connectors/utils/validation.py

@@ -240,6 +240,10 @@ def update_fixtures(
     # Rewrite the current file data
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
+        print(
+            f"Writing {len(all_file_data)} file data to "
+            f"saved fixture location {file_data_output_path}"
+        )
         file_data_output_path.mkdir(parents=True, exist_ok=True)
         for file_data in all_file_data:
             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
@@ -256,6 +260,10 @@ def update_fixtures(
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
+        print(
+            f"Writing {len(download_files)} downloaded files to "
+            f"saved fixture location {raw_download_output_path}"
+        )
         shutil.copytree(download_dir, raw_download_output_path)


@@ -328,6 +336,7 @@ async def source_connector_validation(
             postdownload_file_data = replace(resp["file_data"])
             all_postdownload_file_data.append(postdownload_file_data)
     if not overwrite_fixtures:
+        print("Running validation")
         run_all_validations(
             configs=configs,
             predownload_file_data=all_predownload_file_data,
@@ -336,6 +345,7 @@ async def source_connector_validation(
             test_output_dir=test_output_dir,
         )
     else:
+        print("Running fixtures update")
         update_fixtures(
             output_dir=test_output_dir,
             download_dir=download_dir,
test/integration/connectors/weaviate/__init__.py

File without changes
test/integration/connectors/weaviate/conftest.py

@@ -0,0 +1,15 @@
+import json
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture
+def collections_schema_config() -> dict:
+    int_test_dir = Path(__file__).parent
+    assets_dir = int_test_dir / "assets"
+    config_file = assets_dir / "elements.json"
+    assert config_file.exists()
+    assert config_file.is_file()
+    with config_file.open() as config_data:
+        return json.load(config_data)
test/integration/connectors/weaviate/test_local.py

@@ -0,0 +1,131 @@
+import json
+import time
+from pathlib import Path
+
+import pytest
+import requests
+import weaviate
+from weaviate.client import WeaviateClient
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.docker import container_context
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.weaviate.local import (
+    CONNECTOR_TYPE,
+    LocalWeaviateConnectionConfig,
+    LocalWeaviateUploader,
+    LocalWeaviateUploaderConfig,
+    LocalWeaviateUploadStager,
+)
+
+COLLECTION_NAME = "elements"
+
+
+def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            requests.get("http://localhost:8080/v1/.well-known/read")
+            return
+        except Exception as e:
+            print(f"Failed to validate container healthy, sleeping for {interval} seconds: {e}")
+            time.sleep(interval)
+    raise TimeoutError("Docker container never came up healthy")
+
+
+@pytest.fixture
+def collection(collections_schema_config: dict) -> str:
+    with container_context(
+        image="semitechnologies/weaviate:1.27.3",
+        ports={8080: 8080, 50051: 50051},
+    ):
+        wait_for_container()
+        with weaviate.connect_to_local() as weaviate_client:
+            weaviate_client.collections.create_from_dict(config=collections_schema_config)
+        yield COLLECTION_NAME
+
+
+def get_count(client: WeaviateClient) -> int:
+    collection = client.collections.get(COLLECTION_NAME)
+    resp = collection.aggregate.over_all(total_count=True)
+    return resp.total_count
+
+
+def validate_count(expected_count: int, retries: int = 10, interval: int = 1) -> None:
+    with weaviate.connect_to_local() as weaviate_client:
+        current_count = get_count(client=weaviate_client)
+        retry_count = 0
+        while current_count != expected_count and retry_count < retries:
+            retry_count += 1
+            time.sleep(interval)
+            current_count = get_count(client=weaviate_client)
+        assert current_count == expected_count, (
+            f"Expected count ({expected_count}) doesn't match how "
+            f"much came back from collection: {current_count}"
+        )
+
+
+def run_uploader_and_validate(
+    uploader: LocalWeaviateUploader, path: Path, file_data: FileData, expected_count: int
+):
+    uploader.precheck()
+    uploader.run(path=path, file_data=file_data)
+    validate_count(expected_count=expected_count)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock file data",
+    )
+    stager = LocalWeaviateUploadStager()
+
+    staged_filepath = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+    dynamic_uploader = LocalWeaviateUploader(
+        upload_config=LocalWeaviateUploaderConfig(
+            collection=COLLECTION_NAME,
+        ),
+        connection_config=LocalWeaviateConnectionConfig(),
+    )
+    fixed_size_uploader = LocalWeaviateUploader(
+        upload_config=LocalWeaviateUploaderConfig(
+            collection=COLLECTION_NAME, batch_size=10, dynamic_batch=False
+        ),
+        connection_config=LocalWeaviateConnectionConfig(),
+    )
+    rate_limited_uploader = LocalWeaviateUploader(
+        upload_config=LocalWeaviateUploaderConfig(
+            collection=COLLECTION_NAME, requests_per_minute=50, dynamic_batch=False
+        ),
+        connection_config=LocalWeaviateConnectionConfig(),
+    )
+    with staged_filepath.open() as f:
+        staged_elements = json.load(f)
+    expected_count = len(staged_elements)
+
+    run_uploader_and_validate(
+        uploader=dynamic_uploader,
+        path=staged_filepath,
+        file_data=file_data,
+        expected_count=expected_count,
+    )
+    run_uploader_and_validate(
+        uploader=fixed_size_uploader,
+        path=staged_filepath,
+        file_data=file_data,
+        expected_count=expected_count,
+    )
+    run_uploader_and_validate(
+        uploader=rate_limited_uploader,
+        path=staged_filepath,
+        file_data=file_data,
+        expected_count=expected_count,
+    )
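For reference, the counting helpers above use the weaviate-client v4 API. A compact usage sketch of the same pattern, assuming a Weaviate instance is already reachable on the default local ports and a collection named "elements" exists:

import weaviate

# Connect to a local Weaviate instance (default ports 8080/50051) and count
# the objects in a collection, mirroring get_count()/validate_count() above.
with weaviate.connect_to_local() as client:
    collection = client.collections.get("elements")
    resp = collection.aggregate.over_all(total_count=True)
    print(f"objects in collection: {resp.total_count}")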
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.3.0"  # pragma: no cover
+__version__ = "0.3.1"  # pragma: no cover
unstructured_ingest/utils/data_prep.py

@@ -1,7 +1,9 @@
 import itertools
 import json
 from datetime import datetime
-from typing import Any, Iterable, Optional, Sequence, TypeVar, cast
+from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
+
+import pandas as pd

 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")

@@ -9,6 +11,12 @@ T = TypeVar("T")
 IterableT = Iterable[T]


+def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
+    num_chunks = len(df) // chunk_size + 1
+    for i in range(num_chunks):
+        yield df[i * chunk_size : (i + 1) * chunk_size]
+
+
 def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
     """A helper function to break an iterable into batches of size batch_size."""
     it = iter(iterable)
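A quick usage sketch of the new split_dataframe helper, assuming the 0.3.1 wheel and pandas are installed (note that the `// chunk_size + 1` arithmetic yields a trailing empty chunk when the row count is an exact multiple of chunk_size):

import pandas as pd

from unstructured_ingest.utils.data_prep import split_dataframe

# Break a 250-row frame into batches of at most 100 rows.
df = pd.DataFrame({"id": range(250)})
chunk_sizes = [len(chunk) for chunk in split_dataframe(df, chunk_size=100)]
print(chunk_sizes)  # [100, 100, 50]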
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -1,10 +1,13 @@
 from __future__ import annotations

 import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.elasticsearch  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.lancedb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.weaviate  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
     add_source_entry,
@@ -24,8 +27,6 @@ from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
 from .delta_table import delta_table_destination_entry
-from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
-from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
 from .gitlab import gitlab_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -40,8 +41,6 @@ from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
 from .onedrive import onedrive_destination_entry, onedrive_source_entry
-from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
-from .opensearch import opensearch_destination_entry, opensearch_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
 from .outlook import outlook_source_entry
 from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
@@ -52,8 +51,6 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
 from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
 from .slack import slack_source_entry
-from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
-from .weaviate import weaviate_destination_entry

 add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
 add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
@@ -67,10 +64,6 @@ add_destination_entry(
     destination_type=DELTA_TABLE_CONNECTOR_TYPE, entry=delta_table_destination_entry
 )

-add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
-add_destination_entry(
-    destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
-)

 add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry)

@@ -80,15 +73,9 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
 add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)

-add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
-add_destination_entry(
-    destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
-)

 add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)

-add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
-
 add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
 add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)

unstructured_ingest/v2/processes/connectors/astradb.py

@@ -170,7 +170,7 @@ class AstraDBIndexer(Indexer):

     def precheck(self) -> None:
         try:
-            self.get_collection()
+            self.get_collection().options()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -345,7 +345,7 @@ class AstraDBUploader(Uploader):
                 connection_config=self.connection_config,
                 collection_name=self.upload_config.collection_name,
                 keyspace=self.upload_config.keyspace,
-            )
+            ).options()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
unstructured_ingest/v2/processes/connectors/azure_ai_search.py

@@ -155,6 +155,10 @@ class AzureAISearchUploadStager(UploadStager):
             self.conform_dict(data=element, file_data=file_data) for element in elements_contents
         ]

+        if Path(output_filename).suffix != ".json":
+            output_filename = f"{output_filename}.json"
+        else:
+            output_filename = f"{Path(output_filename).stem}.json"
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
         output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
unstructured_ingest/v2/processes/connectors/delta_table.py

@@ -1,7 +1,8 @@
 import json
 import os
+import traceback
 from dataclasses import dataclass, field
-from multiprocessing import Process
+from multiprocessing import Process, Queue
 from pathlib import Path
 from typing import Any, Optional
 from urllib.parse import urlparse
@@ -27,6 +28,15 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis
 CONNECTOR_TYPE = "delta_table"


+def write_deltalake_with_error_handling(queue, **kwargs):
+    from deltalake.writer import write_deltalake
+
+    try:
+        write_deltalake(**kwargs)
+    except Exception:
+        queue.put(traceback.format_exc())
+
+
 class DeltaTableAccessConfig(AccessConfig):
     aws_access_key_id: Optional[str] = Field(default=None, description="AWS Access Key Id")
     aws_secret_access_key: Optional[str] = Field(default=None, description="AWS Secret Access Key")
@@ -157,7 +167,6 @@ class DeltaTableUploader(Uploader):

     @requires_dependencies(["deltalake"], extras="delta-table")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        from deltalake.writer import write_deltalake

         df = self.read_dataframe(path)
         updated_upload_path = os.path.join(
@@ -176,17 +185,24 @@ class DeltaTableUploader(Uploader):
             "mode": "overwrite",
             "storage_options": storage_options,
         }
+        queue = Queue()
         # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
         # ingest to fail, even though all tasks are completed normally. Putting the writer into a
         # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
         # rust backend to finish
         writer = Process(
-            target=
-            kwargs=writer_kwargs,
+            target=write_deltalake_with_error_handling,
+            kwargs={"queue": queue, **writer_kwargs},
         )
         writer.start()
         writer.join()

+        # Check if the queue has any exception message
+        if not queue.empty():
+            error_message = queue.get()
+            logger.error(f"Exception occurred in write_deltalake: {error_message}")
+            raise RuntimeError(f"Error in write_deltalake: {error_message}")
+

 delta_table_destination_entry = DestinationRegistryEntry(
     connection_config=DeltaTableConnectionConfig,
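The delta_table change above runs the deltalake write in a child process and surfaces any traceback through a multiprocessing Queue. A standalone sketch of that error-propagation pattern, with a deliberately failing stand-in for write_deltalake:

import traceback
from multiprocessing import Process, Queue


def run_writer_with_error_handling(queue: Queue, **kwargs):
    try:
        # Stand-in for write_deltalake(**kwargs); fails on purpose to show propagation.
        raise OSError("simulated writer failure")
    except Exception:
        queue.put(traceback.format_exc())


if __name__ == "__main__":
    queue = Queue()
    writer = Process(target=run_writer_with_error_handling, kwargs={"queue": queue})
    writer.start()
    writer.join()
    # Any traceback placed on the queue by the child is re-raised in the parent.
    if not queue.empty():
        error_message = queue.get()
        raise RuntimeError(f"Error in writer subprocess: {error_message}")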
unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py

@@ -0,0 +1,19 @@
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
+from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
+from .opensearch import opensearch_destination_entry, opensearch_source_entry
+
+add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
+add_destination_entry(
+    destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
+)
+
+add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
+add_destination_entry(
+    destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
+)
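With elasticsearch and opensearch now registered from this subpackage __init__ (and weaviate and lancedb handled the same way), importing the connectors package is enough to make the entries discoverable. A small sketch, assuming connector_registry also exposes dict-like source_registry and destination_registry mappings (those names are not shown in this diff):

# Importing the connectors package triggers the add_source_entry /
# add_destination_entry calls shown above.
import unstructured_ingest.v2.processes.connectors  # noqa: F401
from unstructured_ingest.v2.processes.connector_registry import (
    destination_registry,  # assumed name for the destination entry mapping
    source_registry,  # assumed name for the source entry mapping
)

print("elasticsearch" in source_registry)
print("opensearch" in destination_registry)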